module namespace cmd-model = "http://spraakbanken.gu.se/clarin/xquery/model";
(:
$Id$
:)
import module namespace xdb="http://exist-db.org/xquery/xmldb";
import module namespace util="http://exist-db.org/xquery/util";
(: --- Database locations --- :)
declare variable $cmd-model:cmdiDatabaseURI as xs:string := "xmldb:exist:///db";
(: Collection that the functions of this module pass to collection() when looking up CMD records. :)
declare variable $cmd-model:cmdiMirrorPath as xs:string := "/db/cmdi-mirror";
(: Collection used by the cache helper functions to store and look up generated documents. :)
declare variable $cmd-model:cachePath as xs:string := "/db/cache";
(: Stylesheet handed to transform:transform() by the scanIndex-related functions. :)
declare variable $cmd-model:groupXsl := doc('/db/clarin/group.xsl');
(: --- Operation names; presumably matched by the calling service, they are not referenced in this part of the module --- :)
declare variable $cmd-model:getCollections as xs:string := "getCollections";
declare variable $cmd-model:queryModel as xs:string := "queryModel";
declare variable $cmd-model:scanIndex as xs:string := "scanIndex";
declare variable $cmd-model:searchRetrieve as xs:string := "searchRetrieve";
(: --- Element paths and document type names --- :)
declare variable $cmd-model:typeActorPath as xs:string := "MDGroup/Actors/Actor";
declare variable $cmd-model:typeActorPath0 as xs:string := "Actor";
declare variable $cmd-model:typeActorRolePath as xs:string := "MDGroup/Actors/Actor/Role";
(: Name of the result element built by cmd-model:elem(). :)
declare variable $cmd-model:docTypeTerms as xs:string := "Terms";
declare variable $cmd-model:docTypeSuffix as xs:string := "Values";
(: --- Response formats; only the JSON value is compared against in cmd-model:serialise-as() --- :)
declare variable $cmd-model:responseFormatXml as xs:string := "xml";
declare variable $cmd-model:responseFormatJSon as xs:string := "json";
declare variable $cmd-model:responseFormatText as xs:string := "text";
(: --- Sort keys accepted by cmd-model:scan-index() --- :)
declare variable $cmd-model:scanSortText as xs:string := "text";
declare variable $cmd-model:scanSortSize as xs:string := "size";
declare variable $cmd-model:collectionDocName as xs:string := "collection.xml";
(: Special collection identifier: functions compare against it to select the root of the mirror. :)
declare variable $cmd-model:collectionRoot as xs:string := "root";
(: Extension appended to generated cache document names. :)
declare variable $cmd-model:xmlExt as xs:string := ".xml";
(: Depth passed to cmd-model:elem-r() when cmd-model:search-retrieve() builds a result summary. :)
declare variable $cmd-model:maxDepth as xs:integer := 8;
declare variable $cmd-model:valuesLimit as xs:integer := 100;
(:~
API function getCollections.
Returns the collection listing for $collections down to $max-depth, serialised
according to $format. The listing is read from the cache when a document for the
same collections and depth exists; otherwise it is built with cmd-model:colls()
and stored in the cache first.
:)
declare function cmd-model:get-collections($collections as xs:string+, $format as xs:string, $max-depth as xs:integer) as item() {
    let $cache-id := cmd-model:gen-cache-id("collection", $collections, xs:string($max-depth))
    let $listing :=
        if (fn:not(cmd-model:is-in-cache($cache-id))) then
            cmd-model:store-in-cache($cache-id, cmd-model:colls($collections, $max-depth))
        else
            cmd-model:get-from-cache($cache-id)
    return
        cmd-model:serialise-as($listing, $format)
};
(:~
API function queryModel.
Returns the element model below $cmd-index-path for the given collections, limited
to $max-depth levels and serialised according to $format. The cache key is derived
from the collections together with the index path and from the depth; on a cache
miss the model is computed by cmd-model:elem() and stored.
:)
declare function cmd-model:query-model($cmd-index-path as xs:string, $collection as xs:string+, $format as xs:string, $max-depth as xs:integer) as item()? {
    let $cache-keys := ($collection, $cmd-index-path)
    let $cache-id := cmd-model:gen-cache-id("model", $cache-keys, xs:string($max-depth))
    let $cached := cmd-model:is-in-cache($cache-id)
    let $model :=
        if ($cached) then
            cmd-model:get-from-cache($cache-id)
        else
            cmd-model:store-in-cache($cache-id, cmd-model:elem($collection, $cmd-index-path, $max-depth))
    return
        cmd-model:serialise-as($model, $format)
};
(:~
API function scanIndex.
Works in two phases:
1. create the full index for the given path/element and cache it;
2. select the requested subsequence (on a repeated call only this second step is performed).

$q is split on '='; the first part is used as the index path, the second part
(or the empty string when there is none) as the filter.
$p-sort is only honoured when it equals scanSortText or scanSortSize; any other
value, including the empty sequence, falls back to scanSortText.

NOTE(review): in this copy of the file the third argument of the second
transform:transform() call and the constructor around {$res-term} are missing, so
the function does not parse as shown. Presumably the lost markup consumed $sort,
$filter, $start-item, $max-items, $count-items, $colls, $created and $scan-clause,
none of which are used by the remaining code. Restore from version control.
:)
declare function cmd-model:scan-index($q as xs:string, $collection as xs:string+, $format as xs:string, $start-item as xs:integer, $max-items as xs:integer, $p-sort as xs:string?) as item()? {
let $qa := tokenize($q,'='),
$cmd-index-path := $qa[1],
(: default to '' when $q carries no '=' part :)
$filter := ($qa[2],'')[1],
$sort := if ($p-sort eq $cmd-model:scanSortText or $p-sort eq $cmd-model:scanSortSize) then $p-sort else $cmd-model:scanSortText,
(: the depth component of the cache id is fixed to "1" for index documents :)
$name := cmd-model:gen-cache-id("index", ($collection, $cmd-index-path),"1"),
(: skip cache $doc := cmd-model:values($cmd-index-path, $collection) :)
$doc := if (cmd-model:is-in-cache($name)) then
cmd-model:get-from-cache($name)
else
let $data := cmd-model:values($cmd-index-path, $collection)
return cmd-model:store-in-cache($name, $data)
(: extract the required subsequence (according to given sort) :)
let $res-term := transform:transform($doc,$cmd-model:groupXsl,
),
$count-items := count($res-term/v),
(: $collection is declared xs:string+, so the empty branch is only a safeguard :)
$colls := if (fn:empty($collection)) then '' else fn:string-join($collection, ","),
$created := fn:current-dateTime(),
$scan-clause := concat($cmd-index-path, '=', $filter),
$res := {$res-term}
(: let $result-count := $doc/Term/@count,
$result-seq := fn:subsequence($doc/Term/v, $start-item, $end-item),
$result-frag := ($doc/Term, $result-seq),
$seq-count := fn:count($result-seq) :)
return
cmd-model:serialise-as($res, $format)
};
(:~
API function searchRetrieve.
Decodes and sanitises $xpath-query, evaluates it against the CMD mirror collection
(for the root collection directly, otherwise once per entry of $collections
restricted through ft:query on IsPartOf), and serialises a result fragment built
from the hit count, the query information, the selected records and, when $format
contains 'withSummary', a summary of the selected records.

NOTE(review): the element constructors of the summary and result fragments are
missing in this copy of the file (see the bare "return ," and the brace-only lines
below), so the function does not parse as shown. Restore from version control.
NOTE(review): the query text and the collection ids are spliced into strings that
are executed with util:eval(); if they originate from request parameters this
allows arbitrary XQuery to be run. Confirm that callers validate them.
:)
declare function cmd-model:search-retrieve($xpath-query as xs:string, $collections as xs:string+, $format as xs:string, $start-item as xs:integer, $end-item as xs:integer) as item()* {
let $start-time := util:system-dateTime(),
(: must keep this name: the strings passed to util:eval() below refer to $collection :)
$collection := collection($cmd-model:cmdiMirrorPath),
$decoded-query := xdb:decode($xpath-query),
$sanitized-query := cmd-model:sanitize-query($decoded-query),
$results :=
if ($collections[1] eq $cmd-model:collectionRoot) then
util:eval(fn:concat("$collection", $sanitized-query, "/ancestor-or-self::CMD"))
else
for $coll in $collections return util:eval(fn:concat("$collection/ft:query(descendant::IsPartOf, ", xdb:decode($coll) ,")/ancestor-or-self::CMD", $sanitized-query))
let $result-count := fn:count($results),
(: NOTE(review): the third argument of fn:subsequence() is a length, not an end
   position, so $end-item acts as the maximum number of records returned - confirm
   that callers pass a count here :)
$result-seq := fn:subsequence($results, $start-item, $end-item),
$seq-count := fn:count($result-seq),
$end-time := util:system-dateTime()
let $summary-fragment :=
if (contains($format,'withSummary')) then
(: each distinct value is "child element name of Components" ## "Header/MdProfile" :)
let $used-profiles := for $profile in distinct-values($results//Components/concat(child::element()/name(),'##',../Header/MdProfile))
let $profile-id := substring-after($profile,'##'), $profile-name := substring-before($profile,'##')
return ,
$end-time2 := util:system-dateTime(),
(: the summary is computed over the selected records only, not over all hits :)
$result-summary := cmd-model:elem-r($result-seq//Components, "Components", $cmd-model:maxDepth, $cmd-model:maxDepth),
$end-time3 := util:system-dateTime(),
(: three durations, each measured from $start-time: query, profile listing, summary :)
$duration := concat(($end-time - $start-time),", ", ($end-time2 - $start-time),", ", ($end-time3 - $start-time))
return ({$duration}, {$used-profiles},{$result-summary})
else {$end-time - $start-time}
let $result-fragment :=
{$result-count}
{if ($decoded-query ne $sanitized-query) then concat("Rewritten to '", $sanitized-query, "'") else $xpath-query, $collections, $start-item, $end-item}
{$seq-count}
{$summary-fragment}
{$result-seq}
return
cmd-model:serialise-as($result-fragment, $format)
};
(:
**********************
queryModel, scanIndex - subfunctions
:)
(:
Rewrite a decoded query before it is appended to a path expression.
- "//*" and "descendant::element()" become the empty string.
- A query whose last path segment is one of the names listed below is turned into
  a predicate: "[descendant::NAME]" when the query is exactly "//NAME", otherwise
  "[" + query + "]".
- Any other query is returned unchanged.
The last segment is the last item that text:groups() returns for the pattern
"/([^/]+)$".
:)
declare function cmd-model:sanitize-query($query as xs:string) as xs:string {
    let $tail := text:groups($query, "/([^/]+)$")[last()]
    let $match-all := ("//*", "descendant::element()")
    let $predicate-names := ("Title", "Name", "Role", "Genre", "Country", "Continent", "MdSelfLink", "IsPartOf")
    return
        if ($query = $match-all) then
            ""
        else if (fn:not($tail = $predicate-names)) then
            $query
        else if ($query eq concat("//", $tail)) then
            concat("[descendant::", $tail, "]")
        else
            concat("[", $query, "]")
};
(:
Build the model document for $path: an element named by $cmd-model:docTypeTerms
carrying the joined collection ids (@colls), the requested @depth and the creation
timestamp (@created), followed by the entries computed by cmd-model:elem-r().
The nodes for $path are selected from the whole mirror when the first collection
is the root collection, otherwise per collection through ft:query on IsPartOf.
:)
declare function cmd-model:elem($collections as xs:string+, $path as xs:string, $depth as xs:integer) as element() {
    (: the name $collection is referenced inside the evaluated strings; do not rename :)
    let $collection := collection($cmd-model:cmdiMirrorPath)
    let $path-nodes :=
        if ($collections[1] eq $cmd-model:collectionRoot) then
            util:eval(fn:concat("$collection/descendant-or-self::", $path))
        else
            for $coll in $collections
            return
                util:eval(fn:concat("$collection/ft:query(descendant::IsPartOf, ", xdb:decode($coll), ")/ancestor-or-self::CMD/descendant-or-self::", $path))
    let $entries := cmd-model:elem-r($path-nodes, $path, $depth, $depth)
    return
        element {$cmd-model:docTypeTerms} {
            if (fn:exists($collections)) then
                attribute colls {fn:string-join($collections, ",")}
            else
                (),
            attribute depth {$depth},
            attribute created {fn:current-dateTime()},
            $entries
        }
};
(:
Recursive worker behind cmd-model:elem() and the searchRetrieve summary.
For the given nodes it collects the distinct names of their child elements and,
while $depth is greater than 0, calls itself once per child name with the path
extended by that name and $depth reduced by one. At depth 0 the string 'maxdepth'
is emitted instead.

NOTE(review): the element constructor that wrapped the enclosed expression below
is missing in this copy of the file, so the function does not parse as shown.
Presumably it carried $path, $path-count, $text-count and $text-count-distinct,
which are computed but not used by the remaining code. Restore from version control.
:)
declare function cmd-model:elem-r($path-nodes as node()*, $path as xs:string, $max-depth as xs:integer, $depth as xs:integer) as element() {
let $path-count := count($path-nodes),
$child-elements := $path-nodes/child::element(),
$subs := distinct-values($child-elements/name()),
(: text is only counted when none of the nodes has a child element :)
$nodes-child-terminal := if (empty($child-elements)) then $path-nodes else () (: Maybe some selected elements $child-elements[not(element())] later on :),
$text-nodes := $nodes-child-terminal/text(),
$text-count := count($text-nodes),
$text-count-distinct := count(distinct-values($text-nodes))
return
(: { :)
{
if ($depth > 0) then
(for $elname in $subs[. != '']
return
(: the evaluated string refers to $path-nodes by name; do not rename the parameter :)
cmd-model:elem-r(util:eval(concat("$path-nodes/", $elname)), concat($path, '/', $elname), $max-depth, $depth - 1)
(: values moved to own function: scanIndex
if ($max-depth eq 1 and $text-count gt 0) then cmd-model:values($path-nodes) else ()) :)
)
else 'maxdepth'
}
};
(:
For every node in $n, computes util:node-xpath() of its parent element.
NOTE(review): the element constructor around the enclosed expression is missing in
this copy of the file, so the function does not parse as shown. Restore from
version control.
:)
declare function cmd-model:paths($n) {
for $el in $n
return {
(: a node has at most one parent element, so this loop runs zero or one time :)
for $anc in $el/parent::element()
return util:node-xpath($anc)
}
};
(:
Select the elements matching $path (as descendant-or-self) from the CMD mirror.
When the first entry of $collections is the root collection the whole mirror is
searched; otherwise the search is run once per collection id, restricted to the
CMD records found through ft:query on IsPartOf, and the results are concatenated
in the order of $collections.
:)
declare function cmd-model:collect-nodes($collections as xs:string+, $path as xs:string) as element()* {
    (: the name $collection is referenced inside the evaluated strings; do not rename :)
    let $collection := collection($cmd-model:cmdiMirrorPath)
    return
        if (fn:not($collections[1] eq $cmd-model:collectionRoot)) then
            for $coll in $collections
            return
                util:eval(fn:concat("$collection/ft:query(descendant::IsPartOf, ", xdb:decode($coll), ")/ancestor-or-self::CMD/descendant-or-self::", $path))
        else
            util:eval(fn:concat("$collection/descendant-or-self::", $path))
};
(:
Build the value index for $path: collects the matching nodes with
cmd-model:collect-nodes() and runs them through the grouping stylesheet
($cmd-model:groupXsl) without stylesheet parameters.
NOTE(review): the element constructor around {$nodes} is missing in this copy of
the file, so the function does not parse as shown. Restore from version control.
:)
declare function cmd-model:values($path as xs:string,$collections as xs:string+) as element() {
let $nodes := cmd-model:collect-nodes($collections, $path),
(: $term := {$nodes}
@name is added in xslt:)
$term := {$nodes}
(: use XSLT-2.0 for-each-group functionality to aggregate the values of a node - much, much faster, than XQuery :)
return transform:transform($term,$cmd-model:groupXsl, ())
};
(:
**********************
getCollections - subfunctions
:)
(:
Build the collection listing for the given collection ids.
Each id is resolved to its CMD record(s) with cmd-model:get-resource-by-handle()
and expanded with cmd-model:colls-r() down to $max-depth.
NOTE(review): the element constructor around {$children} is missing in this copy of
the file, so the function does not parse as shown. Presumably it carried $res-count
and $coll-count, which are computed but not used by the remaining code. Restore
from version control.
:)
declare function cmd-model:colls($collections as xs:string+, $max-depth as xs:integer) as element() {
let $children :=
for $collection-item in $collections
return
for $collection-doc in cmd-model:get-resource-by-handle($collection-item)
return cmd-model:colls-r($collection-doc, cmd-model:get-md-collection-name($collection-doc), $collection-doc//MdSelfLink, "", $max-depth)
(: sum of the @cnt attributes of the top-level results :)
let $res-count := sum($children/@cnt)
(: sub-collections reported by the children plus the children themselves :)
let $coll-count := sum($children/@cnt_subcolls) + count($children)
let $data := {$children}
return $data
};
(:
Recurse down in collections.
Child collections are looked up with cmd-model:get-children-colls() unless $depth
is 1, which ends the recursion. For every child the function calls itself with the
child's derived name, its Header/MdSelfLink, the id of the parent's ResourceProxy
that references it, and $depth reduced by one.
NOTE(review): the element constructors of both branches are missing in this copy of
the file (the else branch is empty), so the function does not parse as shown.
Presumably they carried $name, $handle and $proxy-id, which are not used by the
remaining code. Restore from version control.
:)
declare function cmd-model:colls-r($collection as node(), $name as xs:string, $handle as xs:string, $proxy-id as xs:string, $depth as xs:integer) as item()* {
let $children := if ($depth eq 1) then () else cmd-model:get-children-colls($collection)
(: let $dummy := util:log('debug', fn:concat(cmd-model:get-md-collection-name($collection), " ", $collection//MdSelfLink, " ", xs:string($depth), " CHILDREN = ", string-join(for $child in $children return $child//MdSelfLink, "#"))) :)
return
if (fn:exists($children)) then
let $child-results :=
for $child in $children
(: let $child-doc := if (empty($child/unresolvable-uri)) then
cmd-model:get-resource-by-handle($child/ResourceRef) else (), :)
let $child-name := cmd-model:get-md-collection-name($child)
(: falls back to a marker string when the parent has no ResourceProxy pointing at the child :)
let $proxyid := ($collection//ResourceProxy[ResourceRef = $child//MdSelfLink]/@id, concat("UNKNOWN proxy id:", $child//MdSelfLink))[1]
return
cmd-model:colls-r($child, $child-name, $child//Header/MdSelfLink, $proxyid, $depth - 1)
return
{$child-results}
else
};
(:
Get the MD resource (CMD record) by handle.
For an empty id or the root collection id, the CMD records whose IsPartOf equals
the root collection id are returned. For any other id the decoded id is spliced
into an ft:query on MdSelfLink, which is evaluated with util:eval().
Non-full-text alternative kept for reference:
$collection/descendant::MdSelfLink[. = xdb:decode($id)]/ancestor::CMD
:)
declare function cmd-model:get-resource-by-handle($id as xs:string) as node()* {
    (: the name $collection is referenced inside the evaluated string; do not rename :)
    let $collection := collection($cmd-model:cmdiMirrorPath)
    let $wants-root := $id = ("", $cmd-model:collectionRoot)
    return
        if ($wants-root) then
            $collection//IsPartOf[. = $cmd-model:collectionRoot]/ancestor::CMD
        else
            util:eval(concat("$collection/ft:query(descendant::MdSelfLink, ", xdb:decode($id), ")/ancestor::CMD"))
};
(:
Get the next-level collection records (ResourceType = 'Metadata') of a collection.
The MdSelfLink text of the given record is spliced into an ft:query on IsPartOf,
which is evaluated with util:eval(); of the matching CMD records only those with a
ResourceType of 'Metadata' are returned.
Non-full-text alternative kept for reference:
collection($cmd-model:cmdiMirrorPath)/descendant::IsPartOf[. eq $handle]/ancestor::CMD[descendant::ResourceType[. = "Metadata"]]
:)
declare function cmd-model:get-children-colls($collection as node()) as node()* {
    (: the name $cmdi-collection is referenced inside the evaluated string; do not rename :)
    let $cmdi-collection := collection($cmd-model:cmdiMirrorPath)
    let $children-query :=
        concat(
            "$cmdi-collection/ft:query(descendant::IsPartOf, ",
            $collection//MdSelfLink/text(),
            ")/ancestor::CMD[descendant::ResourceType[. = 'Metadata']]"
        )
    return
        util:eval($children-query)
};
(:
Count ALL (independent of maxDepth) resource records that are part of $handle.
A record counts as a resource record when none of its ResourceType descendants is
'Metadata'; this deliberately includes records that have no ResourceType at all.
The result is the exact complement of cmd-model:get-collection-count().

The test uses the general comparison "=" rather than the value comparison "eq":
"eq" requires a single operand and raises a type error for records that contain
more than one ResourceType, whereas "=" is true when any of them matches.

Returns the count as a string.
:)
declare function cmd-model:get-resource-count($handle as xs:string) as xs:string {
    let $resource-records :=
        collection($cmd-model:cmdiMirrorPath)//IsPartOf[. eq $handle]/ancestor::CMD[not(descendant::ResourceType = 'Metadata')]
    return
        xs:string(count($resource-records))
};
(:
Complement of cmd-model:get-resource-count():
count ALL (independent of maxDepth) collection records that are part of $handle,
i.e. CMD records with a ResourceType of "Metadata".
Returns the count as a string.
:)
declare function cmd-model:get-collection-count($handle as xs:string) as xs:string {
    let $collection-records :=
        collection($cmd-model:cmdiMirrorPath)//IsPartOf[. eq $handle]/ancestor::CMD[descendant::ResourceType[. = "Metadata"]]
    let $total := count($collection-records)
    return
        xs:string($total)
};
(:
Try to derive a name from the collection record, more or less agnostic about the
actual schema. The candidates are tried in the order listed; the first one present
wins, and the literal "UNKNOWN" is returned when none exists.
:)
declare function cmd-model:get-md-collection-name($collection-doc as node()) as xs:string {
    let $candidates := (
        $collection-doc//Corpus/Name,
        $collection-doc//Session/Name,
        $collection-doc//Collection/GeneralInfo/Name,
        $collection-doc//Collection/GeneralInfo/Title,
        $collection-doc//Name,
        $collection-doc//name,
        $collection-doc//Title,
        $collection-doc//title,
        "UNKNOWN"
    )
    return
        $candidates[1]
};
(:
***********************
HELPER function - dealing with caching the results
:)
(:
Tell whether the document $doc-name is available in $collection.
Generic helper; not called by the other functions shown in this module.
:)
declare function cmd-model:is-doc-available($collection as xs:string, $doc-name as xs:string) as xs:boolean {
    let $doc-uri := fn:concat($collection, "/", $doc-name)
    return
        fn:doc-available($doc-uri)
};
(:
Tell whether a document named $doc-name exists in the cache collection.
:)
declare function cmd-model:is-in-cache($doc-name as xs:string) as xs:boolean {
    let $cached-uri := fn:concat($cmd-model:cachePath, "/", $doc-name)
    return
        fn:doc-available($cached-uri)
};
(:
Fetch the document named $doc-name from the cache collection.
:)
declare function cmd-model:get-from-cache($doc-name as xs:string) as item()* {
    let $cached-uri := fn:concat($cmd-model:cachePath, "/", $doc-name)
    return
        fn:doc($cached-uri)
};
(:
Store $data in the cache collection under the name $doc-name and return the
document as read back from the cache.
Before storing, the function logs in to the cache collection with the user name
and credential read from /db/clarin/writer.xml.
The statements rely on being evaluated in the written order: login, store, read back.
:)
declare function cmd-model:store-in-cache($doc-name as xs:string, $data as node()) as item()* {
(: writer.xml supplies the write-user and write-user-cred values used for the login :)
let $clarin-writer := fn:doc("/db/clarin/writer.xml"),
(: the result of the login is bound but never checked :)
$dummy := xdb:login($cmd-model:cachePath, $clarin-writer//write-user/text(), $clarin-writer//write-user-cred/text())
(: the result of xdb:store() is bound only to force the call; it is not used afterwards :)
let $store := (: util:catch("org.exist.xquery.XPathException", :) xdb:store($cmd-model:cachePath, $doc-name, $data), (: , ()) :)
$stored-doc := fn:doc(concat($cmd-model:cachePath, "/", $doc-name))
return $stored-doc
};
(:
Create the cache document name for the selected collections (or types) so that
results can be reused.
The name is: type name + depth + "-" + MD5 hash of the keys + the XML extension.
The keys are sorted and joined without a separator before hashing, so the order in
which they are passed does not influence the name.
:)
declare function cmd-model:gen-cache-id($type-name as xs:string, $keys as xs:string+, $depth as xs:string) as xs:string {
    let $ordered-keys :=
        for $k in $keys
        order by $k
        return $k
    let $digest := util:hash(string-join($ordered-keys, ""), "MD5")
    return
        fn:concat($type-name, $depth, "-", $digest, $cmd-model:xmlExt)
};
(:
Serialisation format.
For the JSON format the eXist serialisation option is switched to text output with
media type application/json; the item itself is returned unchanged for every
format (the json:xml-to-json conversion is currently disabled).
:)
declare function cmd-model:serialise-as($item as node()?, $format as xs:string) as item()? {
    let $json-option :=
        if ($format eq $cmd-model:responseFormatJSon) then
            util:declare-option("exist:serialize", "method=text media-type=application/json")
        else
            ()
    return
        $item
};