module namespace cmd-model = "http://spraakbanken.gu.se/clarin/xquery/model";
(: $Id$ :)
(:
   Library module: collection listings, element-structure summaries, index scans
   and search over the records stored below $cmd-model:cmdiMirrorPath. Computed
   results are stored as documents below $cmd-model:cachePath and reused.
:)
import module namespace xdb="http://exist-db.org/xquery/xmldb";
import module namespace util="http://exist-db.org/xquery/util";

(: Database locations. cmdiMirrorPath is the collection every query function in this
   module reads from; cachePath is where cached result documents are stored and looked up.
   cmdiDatabaseURI is not referenced in the code reviewed here. :)
declare variable $cmd-model:cmdiDatabaseURI as xs:string := "xmldb:exist:///db";
declare variable $cmd-model:cmdiMirrorPath as xs:string := "/db/cmdi-mirror";
declare variable $cmd-model:cachePath as xs:string := "/db/cache";

(: Stylesheet document passed to transform:transform by scan-index and values. :)
declare variable $cmd-model:groupXsl := doc('/db/clarin/group.xsl');

(: Names of the four API operations. Not referenced in the code reviewed here;
   presumably used by the calling code to dispatch requests - TODO confirm. :)
declare variable $cmd-model:getCollections as xs:string := "getCollections";
declare variable $cmd-model:queryModel as xs:string := "queryModel";
declare variable $cmd-model:scanIndex as xs:string := "scanIndex";
declare variable $cmd-model:searchRetrieve as xs:string := "searchRetrieve";

(: Element paths for actor data. Not referenced in the code reviewed here. :)
declare variable $cmd-model:typeActorPath as xs:string := "MDGroup/Actors/Actor";
declare variable $cmd-model:typeActorPath0 as xs:string := "Actor";
declare variable $cmd-model:typeActorRolePath as xs:string := "MDGroup/Actors/Actor/Role";

(: docTypeTerms is the name of the root element built by cmd-model:elem.
   docTypeSuffix is not referenced in the code reviewed here. :)
declare variable $cmd-model:docTypeTerms as xs:string := "Terms";
declare variable $cmd-model:docTypeSuffix as xs:string := "Values";

(: Response format names. serialise-as only tests for the JSON value. :)
declare variable $cmd-model:responseFormatXml as xs:string := "xml";
declare variable $cmd-model:responseFormatJSon as xs:string := "json";
declare variable $cmd-model:responseFormatText as xs:string := "text";

(: Sort keys accepted by scan-index; scanSortText is used when none or another value is given. :)
declare variable $cmd-model:scanSortText as xs:string := "text";
declare variable $cmd-model:scanSortSize as xs:string := "size";

(: collectionRoot is the handle value that makes search-retrieve, elem and collect-nodes
   query the whole mirror collection instead of filtering on IsPartOf.
   collectionDocName is not referenced in the code reviewed here. :)
declare variable $cmd-model:collectionDocName as xs:string := "collection.xml";
declare variable $cmd-model:collectionRoot as xs:string := "root";

(: Suffix appended to generated cache document names (see gen-cache-id). :)
declare variable $cmd-model:xmlExt as xs:string := ".xml";

(: maxDepth is the recursion depth search-retrieve passes to elem-r for its result summary.
   valuesLimit is not referenced in the code reviewed here. :)
declare variable $cmd-model:maxDepth as xs:integer := 8;
declare variable $cmd-model:valuesLimit as xs:integer := 100;
(:~ API function getCollections. 
:)
(:
   get-collections: returns the collection listing for the given handles.
   The listing is computed by cmd-model:colls on the first request and stored in the
   cache under an id derived from the handles and the depth; later requests with the
   same handles and depth read the cached document instead.

   @param $collections  collection handles to list
   @param $format       response format name, passed on to cmd-model:serialise-as
   @param $max-depth    recursion depth, passed on to cmd-model:colls
   @return the listing as returned by cmd-model:serialise-as
:)
declare function cmd-model:get-collections($collections as xs:string+, $format as xs:string, $max-depth as xs:integer) as item() {
  let $name := cmd-model:gen-cache-id("collection", $collections, xs:string($max-depth)),
      $doc :=
        if (cmd-model:is-in-cache($name)) then
          cmd-model:get-from-cache($name)
        else
          let $data := cmd-model:colls($collections, $max-depth)
          return cmd-model:store-in-cache($name, $data)
  return cmd-model:serialise-as($doc, $format)
};

(:~
   API function queryModel.
   Returns the element-structure summary built by cmd-model:elem for the given path,
   restricted to the given collections. Uses the same compute-once-then-cache scheme
   as get-collections; the cache id is derived from the collections, the path and the depth.

   @param $cmd-index-path  element path to summarise
   @param $collection      collection handles
   @param $format          response format name, passed on to cmd-model:serialise-as
   @param $max-depth       recursion depth, passed on to cmd-model:elem
   @return the summary as returned by cmd-model:serialise-as
:)
declare function cmd-model:query-model($cmd-index-path as xs:string, $collection as xs:string+, $format as xs:string, $max-depth as xs:integer) as item()? {
  let $name := cmd-model:gen-cache-id("model", ($collection, $cmd-index-path), xs:string($max-depth)),
      $doc :=
        if (cmd-model:is-in-cache($name)) then
          cmd-model:get-from-cache($name)
        else
          let $data := cmd-model:elem($collection, $cmd-index-path, $max-depth)
          return cmd-model:store-in-cache($name, $data)
  return cmd-model:serialise-as($doc, $format)
};

(:~
   API function scanIndex.
   Works in two phases:
     1. build the full value index for the given path via cmd-model:values and cache it;
     2. run the cached index through the group stylesheet to produce the response.
   When the index is already cached, only the second phase is performed.

   $q has the form path=filter; the part before the first equals sign is the index path,
   the part after it (or the empty string when absent) is the filter.

   @param $q           scan clause, see above
   @param $collection  collection handles
   @param $format      response format name, passed on to cmd-model:serialise-as
   @param $start-item  not referenced in the code visible here
   @param $max-items   not referenced in the code visible here
   @param $p-sort      requested sort key; anything other than the two scanSort constants
                       falls back to $cmd-model:scanSortText
   @return the result as returned by cmd-model:serialise-as

   NOTE(review): in this copy the third argument of transform:transform is empty and
   $res is bound to a bare enclosed expression; $sort, $count-items, $colls, $created
   and $scan-clause are bound but not referenced. Presumably element markup that uses
   them is missing here - confirm against the repository version before editing.
:)
declare function cmd-model:scan-index($q as xs:string, $collection as xs:string+, $format as xs:string, $start-item as xs:integer, $max-items as xs:integer, $p-sort as xs:string?) as item()?
{
  let $qa := tokenize($q,'='),
      $cmd-index-path := $qa[1],
      $filter := ($qa[2],'')[1],
      $sort := if ($p-sort eq $cmd-model:scanSortText or $p-sort eq $cmd-model:scanSortSize) then $p-sort else $cmd-model:scanSortText,
      $name := cmd-model:gen-cache-id("index", ($collection, $cmd-index-path),"1"),
      (: skip cache $doc := cmd-model:values($cmd-index-path, $collection) :)
      $doc :=
        if (cmd-model:is-in-cache($name)) then
          cmd-model:get-from-cache($name)
        else
          let $data := cmd-model:values($cmd-index-path, $collection)
          return cmd-model:store-in-cache($name, $data)
  (: extract the required subsequence (according to given sort) :)
  let $res-term := transform:transform($doc,$cmd-model:groupXsl, ),
      $count-items := count($res-term/v),
      $colls := if (fn:empty($collection)) then '' else fn:string-join($collection, ","),
      $created := fn:current-dateTime(),
      $scan-clause := concat($cmd-index-path, '=', $filter),
      $res := {$res-term}
  (: let $result-count := $doc/Term/@count, $result-seq := fn:subsequence($doc/Term/v, $start-item, $end-item), $result-frag := ($doc/Term, $result-seq), $seq-count := fn:count($result-seq) :)
  return cmd-model:serialise-as($res, $format)
};
(:~ API function searchRetrieve. 
:)
(:
   search-retrieve: evaluates a query against the mirror collection and returns the
   matching CMD records together with counts and timing information.

   The query string is decoded with xdb:decode and rewritten by cmd-model:sanitize-query.
   When the first collection handle equals $cmd-model:collectionRoot the query runs on the
   whole mirror collection; otherwise it runs once per handle on the records selected by
   ft:query on IsPartOf. When $format contains 'withSummary', the profiles used by the
   results and an elem-r summary of the returned records are computed as well.

   @param $xpath-query  query string as received from the caller
   @param $collections  collection handles
   @param $format       response format name, passed on to cmd-model:serialise-as
   @param $start-item   position of the first result to return
   @param $end-item     passed as third argument to fn:subsequence
   @return the result as returned by cmd-model:serialise-as

   NOTE(review): the third argument of fn:subsequence is a length, not an end position,
   so $end-item acts as the number of items returned - confirm this is intended.
   NOTE(review): the query string and the collection handles are concatenated into
   expressions executed by util:eval; apart from sanitize-query no validation is visible.
   NOTE(review): the element markup around the enclosed expressions below is not visible
   in this copy - confirm against the repository version before editing.
:)
declare function cmd-model:search-retrieve($xpath-query as xs:string, $collections as xs:string+, $format as xs:string, $start-item as xs:integer, $end-item as xs:integer) as item()* {
  let $start-time := util:system-dateTime(),
      $collection := collection($cmd-model:cmdiMirrorPath),
      $decoded-query := xdb:decode($xpath-query),
      $sanitized-query := cmd-model:sanitize-query($decoded-query),
      $results :=
        if ($collections[1] eq $cmd-model:collectionRoot) then
          util:eval(fn:concat("$collection", $sanitized-query, "/ancestor-or-self::CMD"))
        else
          for $coll in $collections
          return util:eval(fn:concat("$collection/ft:query(descendant::IsPartOf, ", xdb:decode($coll) ,")/ancestor-or-self::CMD", $sanitized-query))
  let $result-count := fn:count($results),
      $result-seq := fn:subsequence($results, $start-item, $end-item),
      $seq-count := fn:count($result-seq),
      $end-time := util:system-dateTime()
  let $summary-fragment :=
    if (contains($format,'withSummary')) then
      let $used-profiles :=
            for $profile in distinct-values($results//Components/concat(child::element()/name(),'##',../Header/MdProfile))
            let $profile-id := substring-after($profile,'##'),
                $profile-name := substring-before($profile,'##')
            return ,
          $end-time2 := util:system-dateTime(),
          $result-summary := cmd-model:elem-r($result-seq//Components, "Components", $cmd-model:maxDepth, $cmd-model:maxDepth),
          $end-time3 := util:system-dateTime(),
          $duration := concat(($end-time - $start-time),", ", ($end-time2 - $start-time),", ", ($end-time3 - $start-time))
      return ({$duration}, {$used-profiles},{$result-summary})
    else {$end-time - $start-time}
  let $result-fragment := {$result-count} {if ($decoded-query ne $sanitized-query) then concat("Rewritten to '", $sanitized-query, "'") else $xpath-query, $collections, $start-item, $end-item} {$seq-count} {$summary-fragment} {$result-seq}
  return cmd-model:serialise-as($result-fragment, $format)
};

(: ********************** queryModel, scanIndex - subfunctions :)

(:
   Rewrites a query string before it is appended to an evaluated path expression.
     - "//*" and "descendant::element()" become the empty string;
     - a query whose last path segment is one of the listed element names becomes a
       predicate: "//Name" turns into "[descendant::Name]", any other such query is
       wrapped in square brackets unchanged;
     - every other query is returned unchanged.
   The last segment is taken from text:groups with the pattern "/([^/]+)$"; presumably
   this yields the text after the final slash - TODO confirm against the text module.
:)
declare function
cmd-model:sanitize-query($query as xs:string) as xs:string {
  let $last-segment := text:groups($query, "/([^/]+)$")[last()]
  return
    if ($query = ("//*", "descendant::element()")) then
      ""
    else if ($last-segment = ("Title", "Name", "Role", "Genre", "Country", "Continent", "MdSelfLink", "IsPartOf")) then
      (: concat("ft:query(",:)
      if ($query eq concat("//", $last-segment)) then
        concat("[descendant::", $last-segment, "]")
      else
        concat("[", $query, "]")
      (:, ", .*)") :)
    else
      $query
};

(:
   Builds the model document for queryModel: selects the nodes matching $path (in the
   whole mirror collection for the root handle, otherwise per collection handle), lets
   elem-r summarise them down to $depth levels, and wraps the result in an element
   named by $cmd-model:docTypeTerms carrying colls, depth and created attributes.

   NOTE(review): $path and the handles are concatenated into expressions run by util:eval.
:)
declare function cmd-model:elem($collections as xs:string+, $path as xs:string, $depth as xs:integer) as element() {
  let $collection := collection($cmd-model:cmdiMirrorPath),
      $path-nodes :=
        if ($collections[1] eq $cmd-model:collectionRoot) then
          util:eval(fn:concat("$collection/descendant-or-self::", $path))
        else
          for $coll in $collections
          return util:eval(fn:concat("$collection/ft:query(descendant::IsPartOf, ", xdb:decode($coll), ")/ancestor-or-self::CMD/descendant-or-self::", $path))
  let $entries := cmd-model:elem-r($path-nodes, $path, $depth, $depth),
      $coll-names-value := if (fn:empty($collections)) then () else attribute colls {fn:string-join($collections, ",")},
      $result := element {$cmd-model:docTypeTerms} {
        $coll-names-value,
        attribute depth {$depth},
        attribute created {fn:current-dateTime()},
        $entries
      }
  return $result
};

(:
   Recursive worker for elem and search-retrieve. For the given nodes it determines the
   distinct names of their child elements and, while $depth is greater than zero, calls
   itself once per child name with $depth reduced by one; at depth zero the string
   'maxdepth' is produced instead.

   NOTE(review): $path-count, $text-count and $text-count-distinct are bound but not
   referenced in this copy; presumably they feed attributes of an element constructor
   whose markup is not visible here - confirm against the repository version.
:)
declare function cmd-model:elem-r($path-nodes as node()*, $path as xs:string, $max-depth as xs:integer, $depth as xs:integer) as element() {
  let $path-count := count($path-nodes),
      $child-elements := $path-nodes/child::element(),
      $subs := distinct-values($child-elements/name()),
      $nodes-child-terminal := if (empty($child-elements)) then $path-nodes else () (: Maybe some selected elements $child-elements[not(element())] later on :),
      $text-nodes := $nodes-child-terminal/text(),
      $text-count := count($text-nodes),
      $text-count-distinct := count(distinct-values($text-nodes))
  return (: { :) { if ($depth > 0) then
      (for $elname in $subs[. != '']
       return cmd-model:elem-r(util:eval(concat("$path-nodes/", $elname)), concat($path, '/', $elname), $max-depth, $depth - 1)
       (: values moved to own function: scanIndex if ($max-depth eq 1 and $text-count gt 0) then cmd-model:values($path-nodes) else ()) :)
      )
    else 'maxdepth' }
};

(:
   For every node in $n, evaluates util:node-xpath on its parent element.
   NOTE(review): the markup wrapping the enclosed expression is not visible in this copy.
:)
declare function cmd-model:paths($n) {
  for $el in $n
  return { for $anc in $el/parent::element() return util:node-xpath($anc) }
};

(:
   Selects the elements matching $path, using the same selection logic as elem:
   the whole mirror collection for the root handle, otherwise the records found by
   ft:query on IsPartOf for each handle.
:)
declare function cmd-model:collect-nodes($collections as xs:string+, $path as xs:string) as element()* {
  let $collection := collection($cmd-model:cmdiMirrorPath),
      $path-nodes :=
        if ($collections[1] eq $cmd-model:collectionRoot) then
          util:eval(fn:concat("$collection/descendant-or-self::", $path))
        else
          for $coll in $collections
          return util:eval(fn:concat("$collection/ft:query(descendant::IsPartOf, ", xdb:decode($coll), ")/ancestor-or-self::CMD/descendant-or-self::", $path))
  return $path-nodes
};

(:
   Builds the value index for scanIndex: collects the nodes for $path and hands them
   to the group stylesheet, which aggregates their values.
   Note the parameter order (path first), which differs from collect-nodes.
:)
declare function cmd-model:values($path as xs:string,$collections as xs:string+) as element() {
  let $nodes := cmd-model:collect-nodes($collections, $path),
      (: $term := {$nodes} @name is added in xslt:)
      $term := {$nodes}
  (: use XSLT-2.0 for-each-group functionality to aggregate the values of a node - much, much faster, than XQuery :)
  return transform:transform($term,$cmd-model:groupXsl, ())
};

(: ********************** getCollections - subfunctions :)

(:
   Builds the collection listing for getCollections: resolves every handle to its
   record(s) and lets colls-r descend from each of them down to $max-depth levels.

   NOTE(review): $res-count and $coll-count are bound but not referenced in this copy;
   presumably they feed attributes of the wrapping element, whose markup is not visible.
:)
declare function cmd-model:colls($collections as xs:string+, $max-depth as xs:integer) as element() {
  let $children :=
    for $collection-item in $collections
    return
      for $collection-doc in cmd-model:get-resource-by-handle($collection-item)
      return cmd-model:colls-r($collection-doc, cmd-model:get-md-collection-name($collection-doc), $collection-doc//MdSelfLink, "", $max-depth)
  let $res-count := sum($children/@cnt)
  let $coll-count := sum($children/@cnt_subcolls) + count($children)
  let $data := {$children}
  return $data
};
(: Recurse down in collections. 
:)
(:
   colls-r: builds the listing entry for one collection record and, unless $depth is 1,
   for its child collection records, calling itself with $depth reduced by one.

   @param $collection  the collection record
   @param $name        display name of the record
   @param $handle      handle of the record
   @param $proxy-id    id of the ResourceProxy in the parent that points to this record
   @param $depth       remaining levels; children are only looked up while it is not 1

   NOTE(review): recursion stops only when $depth equals 1 or no children are found, so
   a start value below 1 is bounded by the data alone - confirm callers never pass that.
   NOTE(review): the element markup of both result branches is not visible in this copy;
   $name, $handle and $proxy-id are presumably used there - confirm.
:)
declare function cmd-model:colls-r($collection as node(), $name as xs:string, $handle as xs:string, $proxy-id as xs:string, $depth as xs:integer) as item()* {
  let $children := if ($depth eq 1) then () else cmd-model:get-children-colls($collection)
  (: let $dummy := util:log('debug', fn:concat(cmd-model:get-md-collection-name($collection), " ", $collection//MdSelfLink, " ", xs:string($depth), " CHILDREN = ", string-join(for $child in $children return $child//MdSelfLink, "#"))) :)
  return
    if (fn:exists($children)) then
      let $child-results :=
        for $child in $children
        (: let $child-doc := if (empty($child/unresolvable-uri)) then cmd-model:get-resource-by-handle($child/ResourceRef) else (), :)
        let $child-name := cmd-model:get-md-collection-name($child)
        let $proxyid := ($collection//ResourceProxy[ResourceRef = $child//MdSelfLink]/@id, concat("UNKNOWN proxy id:", $child//MdSelfLink))[1]
        return cmd-model:colls-r($child, $child-name, $child//Header/MdSelfLink, $proxyid, $depth - 1)
      return {$child-results}
    else
};

(:
   Get the MD resource by handle.
   For the empty string or the root handle, returns the CMD records whose IsPartOf
   equals $cmd-model:collectionRoot; otherwise returns the CMD records found by
   ft:query on MdSelfLink for the decoded handle.
   NOTE(review): the decoded handle is concatenated into an expression run by util:eval.
:)
declare function cmd-model:get-resource-by-handle($id as xs:string) as node()* {
  let $collection := collection($cmd-model:cmdiMirrorPath)
  return
    if ($id eq "" or $id eq $cmd-model:collectionRoot) then
      $collection//IsPartOf[. = $cmd-model:collectionRoot]/ancestor::CMD
    else
      util:eval(concat("$collection/ft:query(descendant::MdSelfLink, ", xdb:decode($id), ")/ancestor::CMD"))
  (: $collection/descendant::MdSelfLink[. = xdb:decode($id)]/ancestor::CMD :)
};

(:
   Get the next-level collection records of the given record: CMD records that
   ft:query on IsPartOf finds for the record's MdSelfLink and that contain a
   ResourceType equal to 'Metadata'.
   The MdSelfLink text is inserted as-is into the evaluated expression.
:)
declare function cmd-model:get-children-colls($collection as node()) as node()* {
  let $handle := $collection//MdSelfLink/text(),
      $cmdi-collection := collection($cmd-model:cmdiMirrorPath)
  return
    util:eval(concat("$cmdi-collection/ft:query(descendant::IsPartOf, ", $handle, ")/ancestor::CMD[descendant::ResourceType[. 
= 'Metadata']]"))
  (: collection($cmd-model:cmdiMirrorPath)/descendant::IsPartOf[. eq $handle]/ancestor::CMD[descendant::ResourceType[. = "Metadata"]] :)
};

(:
   Counts all resource records of a collection, independent of maxDepth: CMD records
   whose IsPartOf equals $handle and whose ResourceType is not 'Metadata'. Records
   without any ResourceType are counted too. The count is returned as a string.
   NOTE(review): the predicate uses the value comparison eq, which raises an error when
   a record has more than one descendant ResourceType - confirm the data rules that out.
:)
declare function cmd-model:get-resource-count($handle as xs:string) as xs:string {
  (: xs:string(count(collection($cmd-model:cmdiMirrorPath)//IsPartOf[. eq $handle]/ancestor::CMD[descendant::ResourceType[. = "Resource"] or not(exists(descendant::ResourceType)) ])):)
  xs:string(count(collection($cmd-model:cmdiMirrorPath)//IsPartOf[. eq $handle]/ancestor::CMD[not(descendant::ResourceType eq 'Metadata') ]))
};

(:
   Complement of cmd-model:get-resource-count: counts all collection records of a
   collection, independent of maxDepth, i.e. CMD records whose IsPartOf equals $handle
   and that contain a ResourceType equal to "Metadata". Returned as a string.
:)
declare function cmd-model:get-collection-count($handle as xs:string) as xs:string {
  xs:string(count(collection($cmd-model:cmdiMirrorPath)//IsPartOf[. eq $handle]/ancestor::CMD[descendant::ResourceType[. = "Metadata"]]))
};

(:
   Derives a display name from a collection record without relying on one schema:
   the first match in the candidate list below is used, most specific paths first,
   and "UNKNOWN" when none of them matches.
:)
declare function cmd-model:get-md-collection-name($collection-doc as node()) as xs:string {
  ($collection-doc//Corpus/Name,
   $collection-doc//Session/Name,
   $collection-doc//Collection/GeneralInfo/Name,
   $collection-doc//Collection/GeneralInfo/Title,
   $collection-doc//Name,
   $collection-doc//name,
   $collection-doc//Title,
   $collection-doc//title,
   "UNKNOWN")[1]
};

(: *********************** HELPER function - dealing with caching the results :)
(: Function for telling wether the document is available or not. 
generic, currently not used :)
(:
   @param $collection  database path of the collection holding the document
   @param $doc-name    name of the document inside that collection
   @return true when a document exists at collection-path/doc-name
:)
declare function cmd-model:is-doc-available($collection as xs:string, $doc-name as xs:string) as xs:boolean {
    let $doc-uri := concat($collection, "/", $doc-name)
    return doc-available($doc-uri)
};

(:
   Tells whether a document with the given name exists in the cache collection
   ($cmd-model:cachePath).
:)
declare function cmd-model:is-in-cache($doc-name as xs:string) as xs:boolean {
    cmd-model:is-doc-available($cmd-model:cachePath, $doc-name)
};

(:
   Reads the document with the given name from the cache collection.
:)
declare function cmd-model:get-from-cache($doc-name as xs:string) as item()* {
    let $cached-uri := concat($cmd-model:cachePath, "/", $doc-name)
    return doc($cached-uri)
};

(:
   Stores $data in the cache collection under $doc-name and returns the stored
   document as read back from the database.
   The user name and credential for writing are taken from /db/clarin/writer.xml;
   the login has to happen before the store, so the binding order below matters.
   The results of the login and of the store call are not checked.
:)
declare function cmd-model:store-in-cache($doc-name as xs:string, $data as node()) as item()* {
    let $writer-config := doc("/db/clarin/writer.xml")
    let $write-user := $writer-config//write-user/text()
    let $write-cred := $writer-config//write-user-cred/text()
    let $logged-in := xdb:login($cmd-model:cachePath, $write-user, $write-cred)
    let $stored-path := xdb:store($cmd-model:cachePath, $doc-name, $data)
    return doc(concat($cmd-model:cachePath, "/", $doc-name))
};

(:
   Creates the cache document name for a result: type name and depth, a dash, the
   MD5 hash of the sorted keys, and the .xml suffix. Sorting makes the name
   independent of the order in which the keys were given.
   NOTE(review): the sorted keys are joined without a separator before hashing, so
   different key lists with the same concatenation get the same name - confirm
   this cannot happen with real handles and paths.
:)
declare function cmd-model:gen-cache-id($type-name as xs:string, $keys as xs:string+, $depth as xs:string) as xs:string {
    let $ordered-keys :=
        for $k in $keys
        order by $k
        return $k
    let $digest := util:hash(string-join($ordered-keys, ""), "MD5")
    return concat($type-name, $depth, "-", $digest, $cmd-model:xmlExt)
};

(:
   Serialisation format.
   For the JSON format the serialisation option is switched to text output with the
   application/json media type; the item itself is returned unchanged for every format
   (the XML-to-JSON conversion is not active: json:xml-to-json is not called).
:)
declare function cmd-model:serialise-as($item as node()?, $format as xs:string) as item()? {
    let $serialization :=
        if ($format eq $cmd-model:responseFormatJSon) then
            util:declare-option("exist:serialize", "method=text media-type=application/json")
        else
            ()
    return $item
};