Source code for decomp.semantics.uds.corpus

"""Module for representing UDS corpora with sentence and document collections.

This module provides the UDSCorpus class for managing collections of Universal
Decompositional Semantics (UDS) graphs at both sentence and document levels.
It includes:

- Loading corpora from various formats (CoNLL, JSON)
- Managing sentence-level and document-level graphs
- Adding annotations to existing graphs
- Querying graphs using SPARQL
- Serialization and deserialization functionality

The UDSCorpus extends PredPattCorpus to support UDS-specific annotations and
document-level semantic relationships.
"""

import importlib.resources
import json
import os
from collections.abc import Callable, Sequence
from functools import lru_cache
from glob import glob
from io import BytesIO
from logging import warn
from os.path import basename, splitext
from random import sample
from typing import Literal, TextIO, cast
from zipfile import ZipFile

import requests
from rdflib.plugins.sparql.sparql import Query
from rdflib.query import Result

from ..predpatt import PredPattCorpus
from .annotation import NormalizedUDSAnnotation, RawUDSAnnotation, UDSAnnotation
from .document import SentenceGraphDict, UDSDocument
from .graph import EdgeAttributes, EdgeKey, NodeAttributes, UDSSentenceGraph
from .metadata import AnnotationMetadataDict, UDSCorpusMetadata, UDSPropertyMetadata


type Location = str | TextIO
"""File location as either a file path string or an open file handle."""


[docs] class UDSCorpus(PredPattCorpus): """A collection of Universal Decompositional Semantics graphs. Parameters ---------- sentences the predpatt sentence graphs to associate the annotations with documents the documents associated with the predpatt sentence graphs sentence_annotations additional annotations to associate with predpatt nodes on sentence-level graphs; in most cases, no such annotations will be passed, since the standard UDS annotations are automatically loaded document_annotations additional annotations to associate with predpatt nodes on document-level graphs version the version of UDS datasets to use split the split to load: "train", "dev", or "test" annotation_format which annotation type to load ("raw" or "normalized") """ UD_URL = ( 'https://github.com/UniversalDependencies/' 'UD_English-EWT/archive/r1.2.zip' ) ANN_DIR = f"{importlib.resources.files('decomp') / 'data'}/" CACHE_DIR = f"{importlib.resources.files('decomp') / 'data'}/"
[docs] def __init__(self, sentences: PredPattCorpus | dict[str, UDSSentenceGraph] | None = None, documents: dict[str, UDSDocument] | None = None, sentence_annotations: list[UDSAnnotation] | None = None, document_annotations: list[UDSAnnotation] | None = None, version: str = '2.0', split: str | None = None, annotation_format: str = 'normalized'): self._validate_arguments(sentences, documents, version, split, annotation_format) self.version = version self.annotation_format = annotation_format self._metadata = UDSCorpusMetadata() # methods inherited from Corpus that reference the self._graphs # attribute will operate on sentence-level graphs only # more specific type than parent's dict[Hashable, OutGraph] # we're intentionally narrowing the type from the parent class self._graphs: SentenceGraphDict = {} # type: ignore[assignment] # narrowing parent's dict[Hashable, Any] to dict[str, UDSSentenceGraph] self._sentences = self._graphs self._documents: dict[str, UDSDocument] = {} self._initialize_paths(version, annotation_format) all_built = self._check_build_status() if sentences is None and split in self._sentences_paths: self._load_split(split) elif sentences is None and split is None and all_built: for split in ['train', 'dev', 'test']: self._load_split(split) elif sentences is None: # download UD-EWT udewt = requests.get(self.UD_URL).content if sentence_annotations or document_annotations: warn("sentence and document annotations ignored") self._process_conll(split, udewt) else: if isinstance(sentences, PredPattCorpus): self._sentences = {str(name): UDSSentenceGraph(g, str(name)) for name, g in sentences.items()} self._graphs = self._sentences else: # When sentences is already a dict of UDSSentenceGraph objects self._sentences = sentences self._graphs = self._sentences self._documents = documents or {} if sentence_annotations or document_annotations: self.add_annotation( sentence_annotations or [], document_annotations or [] )
def _validate_arguments( self, sentences: PredPattCorpus | dict[str, UDSSentenceGraph] | None, documents: dict[str, UDSDocument] | None, version: str, split: str | None, annotation_format: str ) -> None: """Validate constructor arguments for consistency. Parameters ---------- sentences : PredPattCorpus | dict[str, UDSSentenceGraph] | None Optional sentence graphs documents : dict[str, UDSDocument] | None Optional document collection version : str UDS version split : str | None Data split (train/dev/test) annotation_format : str Format (raw/normalized) Raises ------ ValueError If arguments are inconsistent or invalid """ # neither documents nor graphs should be supplied to the constructor # without the other if sentences is None and documents is not None: raise ValueError( 'UDS documents were provided without sentences. ' 'Cannot construct corpus.' ) elif sentences is not None and documents is None: raise ValueError( 'UDS sentences were provided without documents. ' 'Cannot construct corpus.' ) if not (split is None or split in ['train', 'dev', 'test']): raise ValueError('split must be "train", "dev", or "test"') if annotation_format not in ['raw', 'normalized']: raise ValueError( f'Unrecognized annotation format {annotation_format}. ' 'Must be either "raw" or "normalized".' ) def _initialize_paths(self, version: str, annotation_format: str) -> None: """Initialize file paths for data loading. Sets up paths for sentence/document graphs and annotations based on version and format. Extracts zip files if needed. Parameters ---------- version : str UDS dataset version annotation_format : str 'raw' or 'normalized' format """ self._sentences_paths = {splitext(basename(p))[0].split('-')[-2]: p for p in glob(os.path.join(self.CACHE_DIR, version, annotation_format, 'sentence', '*.json'))} self._documents_paths = {splitext(basename(p))[0].split('-')[-2]: p for p in glob(os.path.join(self.CACHE_DIR, version, annotation_format, 'document', '*.json'))} self._sentences_annotation_dir = os.path.join(self.ANN_DIR, version, annotation_format, 'sentence', 'annotations') self._documents_annotation_dir = os.path.join(self.ANN_DIR, version, annotation_format, 'document', 'annotations') sent_ann_paths = glob(os.path.join(self._sentences_annotation_dir, '*.json')) doc_ann_paths = glob(os.path.join(self._documents_annotation_dir, '*.json')) # out of the box, the annotations are stored as zip files and the # JSON they contain must be extracted if not sent_ann_paths: zipped_sent_paths = os.path.join(self._sentences_annotation_dir, '*.zip') zipped_sentence_annotations = glob(zipped_sent_paths) for zipped in zipped_sentence_annotations: ZipFile(zipped).extractall(path=self._sentences_annotation_dir) sent_ann_paths = glob(os.path.join(self._sentences_annotation_dir, '*.json')) if not doc_ann_paths: zipped_doc_paths = os.path.join(self._documents_annotation_dir, '*.zip') zipped_document_annotations = glob(zipped_doc_paths) for zipped in zipped_document_annotations: ZipFile(zipped).extractall(path=self._documents_annotation_dir) doc_ann_paths = glob(os.path.join(self._documents_annotation_dir, '*.json')) self._sentence_annotation_paths = sent_ann_paths self._document_annotation_paths = doc_ann_paths def _check_build_status(self) -> bool: """Check if all data splits are built and available. Returns ------- bool True if train/dev/test splits are all available """ sentences_built = bool(self._sentences_paths) and \ all(s in self._sentences_paths for s in ['train', 'dev', 'test']) documents_built = bool(self._documents_paths) and \ all(s in self._documents_paths for s in ['train', 'dev', 'test']) return sentences_built and documents_built def _load_split(self, split: str) -> None: """Load a specific data split into the corpus. Parameters ---------- split : str Split name ('train', 'dev', or 'test') """ sentence_fpath = self._sentences_paths[split] doc_fpath = self._documents_paths[split] split_corpus = self.__class__.from_json(sentence_fpath, doc_fpath) self._metadata += split_corpus.metadata self._sentences.update(split_corpus._sentences) self._documents.update(split_corpus._documents) def _process_conll(self, split: str | None, udewt: bytes) -> None: """Process CoNLL data from UD-EWT archive. Extracts and processes CoNLL files, creates UDS graphs, and saves to cache. Parameters ---------- split : str | None Specific split to process, or None for all udewt : bytes UD-EWT archive content """ with ZipFile(BytesIO(udewt)) as zf: conll_names = [fname for fname in zf.namelist() if splitext(fname)[-1] == '.conllu'] for fn in conll_names: with zf.open(fn) as conll: conll_str = conll.read().decode('utf-8') sname = splitext(basename(fn))[0].split('-')[-1] spl = self.__class__.from_conll_and_annotations(conll_str, self._sentence_annotation_paths, self._document_annotation_paths, annotation_format=self.annotation_format, version=self.version, name=f'ewt-{sname}') if sname == split or split is None: # add metadata self._metadata += spl.metadata # prepare sentences sentences_json_name = ( f'uds-ewt-sentences-{sname}-{self.annotation_format}.json' ) sentences_json_path = os.path.join(self.__class__.CACHE_DIR, self.version, self.annotation_format, 'sentence', sentences_json_name) self._sentences.update(spl._sentences) self._sentences_paths[sname] = sentences_json_path # prepare documents documents_json_name = ( f'uds-ewt-documents-{sname}-{self.annotation_format}.json' ) documents_json_path = os.path.join(self.__class__.CACHE_DIR, self.version, self.annotation_format, 'document', documents_json_name) self._documents.update(spl._documents) self._documents_paths[sname] = documents_json_path # serialize both spl.to_json(sentences_json_path, documents_json_path)
[docs] @classmethod def from_conll_and_annotations( cls, corpus: Location, sentence_annotations: Sequence[Location] = [], document_annotations: Sequence[Location] = [], annotation_format: str = 'normalized', version: str = '2.0', name: str = 'ewt' ) -> 'UDSCorpus': """Load UDS graph corpus from CoNLL (dependencies) and JSON (annotations). This method should only be used if the UDS corpus is being (re)built. Otherwise, loading the corpus from the JSON shipped with this package using UDSCorpus.__init__ or UDSCorpus.from_json is suggested. Parameters ---------- corpus (path to) Universal Dependencies corpus in conllu format sentence_annotations a list of paths to JSON files or open JSON files containing sentence-level annotations document_annotations a list of paths to JSON files or open JSON files containing document-level annotations annotation_format Whether the annotation is raw or normalized version the version of UDS datasets to use name corpus name to be appended to the beginning of graph ids """ # select appropriate loader based on format loader: Callable[[str | TextIO], RawUDSAnnotation | NormalizedUDSAnnotation] if annotation_format == 'raw': loader = RawUDSAnnotation.from_json elif annotation_format == 'normalized': loader = NormalizedUDSAnnotation.from_json else: raise ValueError('annotation_format must be either' '"raw" or "normalized"') predpatt_corpus = PredPattCorpus.from_conll(corpus, name=name) predpatt_sentence_graphs = {str(graph_name): UDSSentenceGraph(g, str(graph_name)) for graph_name, g in predpatt_corpus.items()} predpatt_documents = cls._initialize_documents(predpatt_sentence_graphs) # process sentence-level graph annotations processed_sentence_annotations = [] for ann_path in sentence_annotations: ann = loader(ann_path) processed_sentence_annotations.append(ann) # process document-level graph annotations processed_document_annotations = [] for ann_path in document_annotations: ann = loader(ann_path) processed_document_annotations.append(ann) # create corpus and add annotations after creation # cast needed because constructor expects PredPattCorpus but we have # dict[str, UDSSentenceGraph] uds_corpus: UDSCorpus = cls( cast(PredPattCorpus | None, predpatt_sentence_graphs), predpatt_documents ) # add sentence annotations for ann in processed_sentence_annotations: uds_corpus.add_sentence_annotation(ann) # add document annotations for ann in processed_document_annotations: uds_corpus.add_document_annotation(ann) return uds_corpus
@classmethod def _load_ud_ids( cls, sentence_ids_only: bool = False ) -> dict[str, dict[str, str]] | dict[str, str]: """Load Universal Dependencies IDs for sentences and documents. Parameters ---------- sentence_ids_only : bool, optional If True, return only sentence IDs. Default is False. Returns ------- dict[str, dict[str, str]] | dict[str, str] Full ID mapping or just sentence IDs based on parameter """ # load in the document and sentence IDs for each sentence-level graph ud_ids_path = os.path.join(cls.ANN_DIR, 'ud_ids.json') with open(ud_ids_path) as ud_ids_file: ud_ids: dict[str, dict[str, str]] = json.load(ud_ids_file) if sentence_ids_only: return {k: v['sentence_id'] for k, v in ud_ids.items()} else: return ud_ids
[docs] @classmethod def from_json( cls, sentences_jsonfile: Location, documents_jsonfile: Location ) -> 'UDSCorpus': """Load annotated UDS graph corpus (including annotations) from JSON. This is the suggested method for loading the UDS corpus. Parameters ---------- sentences_jsonfile file containing Universal Decompositional Semantics corpus sentence-level graphs in JSON format documents_jsonfile file containing Universal Decompositional Semantics corpus document-level graphs in JSON format """ sentences_ext = splitext( basename(sentences_jsonfile if isinstance(sentences_jsonfile, str) else 'dummy.json') )[-1] documents_ext = splitext( basename(documents_jsonfile if isinstance(documents_jsonfile, str) else 'dummy.json') )[-1] sent_ids = cast(dict[str, str], cls._load_ud_ids(sentence_ids_only=True)) # process sentence-level graphs if isinstance(sentences_jsonfile, str) and sentences_ext == '.json': with open(sentences_jsonfile) as infile: sentences_json = json.load(infile) elif isinstance(sentences_jsonfile, str): sentences_json = json.loads(sentences_jsonfile) else: sentences_json = json.load(sentences_jsonfile) sentences: dict[str, UDSSentenceGraph] = { name: cast(UDSSentenceGraph, UDSSentenceGraph.from_dict(g_json, name)) for name, g_json in sentences_json['data'].items() } # process document-level graphs if isinstance(documents_jsonfile, str) and documents_ext == '.json': with open(documents_jsonfile) as infile: documents_json = json.load(infile) elif isinstance(documents_jsonfile, str): documents_json = json.loads(documents_jsonfile) else: documents_json = json.load(documents_jsonfile) documents = {name: UDSDocument.from_dict(d_json, sentences, sent_ids, name) for name, d_json in documents_json['data'].items()} corpus = cls(sentences, documents) metadata_dict = { 'sentence_metadata': sentences_json['metadata'], 'document_metadata': documents_json['metadata'] } metadata = UDSCorpusMetadata.from_dict(cast( dict[Literal['sentence_metadata', 'document_metadata'], AnnotationMetadataDict], metadata_dict )) corpus.add_corpus_metadata(metadata) return corpus
[docs] def add_corpus_metadata(self, metadata: UDSCorpusMetadata) -> None: """Add metadata to the corpus. Parameters ---------- metadata : UDSCorpusMetadata Metadata to merge with existing corpus metadata """ self._metadata += metadata
[docs] def add_annotation( self, sentence_annotation: list[UDSAnnotation] | None = None, document_annotation: list[UDSAnnotation] | None = None ) -> None: """Add annotations to UDS sentence and document graphs. Parameters ---------- sentence_annotation the annotations to add to the sentence graphs in the corpus document_annotation the annotations to add to the document graphs in the corpus """ if sentence_annotation: for ann in sentence_annotation: self.add_sentence_annotation(ann) if document_annotation: for ann in document_annotation: self.add_document_annotation(ann)
[docs] def add_sentence_annotation(self, annotation: UDSAnnotation) -> None: """Add annotations to UDS sentence graphs. Parameters ---------- annotation the annotations to add to the graphs in the corpus """ self._metadata.add_sentence_metadata(annotation.metadata) for gname, (node_attrs, edge_attrs) in annotation.items(): if gname in self._sentences: self._sentences[gname].add_annotation( cast(dict[str, NodeAttributes], node_attrs), cast(dict[EdgeKey, EdgeAttributes], edge_attrs) )
[docs] def add_document_annotation(self, annotation: UDSAnnotation) -> None: """Add annotations to UDS documents. Parameters ---------- annotation the annotations to add to the documents in the corpus """ self._metadata.add_document_metadata(annotation.metadata) for dname, (node_attrs, edge_attrs) in annotation.items(): if dname in self._documents: self._documents[dname].add_annotation( cast(dict[str, NodeAttributes], node_attrs), cast(dict[EdgeKey, EdgeAttributes], edge_attrs) )
@classmethod def _initialize_documents(cls, graphs: dict[str, UDSSentenceGraph]) -> dict[str, UDSDocument]: """Create document collection from sentence graphs. Groups sentence graphs by document ID and creates UDSDocument objects. Parameters ---------- graphs : dict[str, UDSSentenceGraph] Sentence graphs to organize into documents Returns ------- dict[str, UDSDocument] Documents keyed by document ID """ # load the UD document and sentence IDs ud_ids = cast(dict[str, dict[str, str]], cls._load_ud_ids()) # add each graph to the appropriate document documents: dict[str, UDSDocument] = {} for name, graph in graphs.items(): doc_id = ud_ids[name]['document_id'] sent_id = ud_ids[name]['sentence_id'] graph.document_id = doc_id graph.sentence_id = sent_id # add the graph to an existing document if doc_id in documents: documents[doc_id].add_sentence_graphs({name: graph}, {name: sent_id}) # create a new document else: genre = doc_id.split('-')[0] timestamp = UDSDocument._get_timestamp_from_document_name(doc_id) documents[doc_id] =\ UDSDocument({name: graph}, {name: sent_id}, doc_id, genre, timestamp) return documents
[docs] def to_json(self, sentences_outfile: Location | None = None, documents_outfile: Location | None = None) -> str | None: """Serialize corpus to json. Parameters ---------- sentences_outfile file to serialize sentence-level graphs to documents_outfile file to serialize document-level graphs to """ metadata_serializable = self._metadata.to_dict() # convert graphs to dictionaries sentences_serializable = {'metadata': metadata_serializable['sentence_metadata'], 'data': {name: graph.to_dict() for name, graph in self._sentences.items()}} if sentences_outfile is None: return json.dumps(sentences_serializable) elif isinstance(sentences_outfile, str): with open(sentences_outfile, 'w') as out: json.dump(sentences_serializable, out) else: json.dump(sentences_serializable, sentences_outfile) # serialize documents (note: we serialize only the *graphs* # for each document, not the metadata, which is loaded by # other means when calling UDSDocument.from_dict) documents_serializable = {'metadata': metadata_serializable['document_metadata'], 'data': {name: doc.document_graph.to_dict() for name, doc in self._documents.items()}} if documents_outfile is None: return json.dumps(documents_serializable) elif isinstance(documents_outfile, str): with open(documents_outfile, 'w') as out: json.dump(documents_serializable, out) else: json.dump(documents_serializable, documents_outfile) return None
[docs] @lru_cache(maxsize=128) # noqa: B019 def query(self, query: str | Query, query_type: str | None = None, cache_query: bool = True, cache_rdf: bool = True ) -> dict[str, Result | dict[str, NodeAttributes] | dict[EdgeKey, EdgeAttributes]]: """Query all graphs in the corpus using SPARQL 1.1. Parameters ---------- query a SPARQL 1.1 query query_type whether this is a 'node' query or 'edge' query. If set to None (default), a Results object will be returned. The main reason to use this option is to automatically format the output of a custom query, since Results objects require additional postprocessing. cache_query whether to cache the query. This should usually be set to True. It should generally only be False when querying particular nodes or edges--e.g. as in precompiled queries. clear_rdf whether to delete the RDF constructed for querying against. This will slow down future queries but saves a lot of memory """ return {str(gid): graph.query(query, query_type, cache_query, cache_rdf) for gid, graph in self.items()}
@property def documents(self) -> dict[str, UDSDocument]: """The documents in the corpus. Returns ------- dict[str, UDSDocument] Mapping from document IDs to UDSDocument objects """ return self._documents @property def documentids(self) -> list[str]: """The document IDs in the corpus. Returns ------- list[str] List of all document identifiers """ return list(self._documents) @property def ndocuments(self) -> int: """The number of documents in the corpus. Returns ------- int Total document count """ return len(self._documents)
[docs] def sample_documents(self, k: int) -> dict[str, UDSDocument]: """Sample k documents without replacement. Parameters ---------- k the number of documents to sample """ return {doc_id: self._documents[doc_id] for doc_id in sample(list(self._documents.keys()), k=k)}
@property def metadata(self) -> UDSCorpusMetadata: """The corpus metadata. Returns ------- UDSCorpusMetadata Metadata for sentence and document annotations """ return self._metadata @property def sentence_node_subspaces(self) -> set[str]: """The UDS sentence node subspaces in the corpus. Returns ------- set[str] Set of subspace names for sentence nodes Raises ------ NotImplementedError This property is not yet implemented """ raise NotImplementedError @property def sentence_edge_subspaces(self) -> set[str]: """The UDS sentence edge subspaces in the corpus. Returns ------- set[str] Set of subspace names for sentence edges Raises ------ NotImplementedError This property is not yet implemented """ raise NotImplementedError @property def sentence_subspaces(self) -> set[str]: """All UDS sentence subspaces (node and edge) in the corpus. Returns ------- set[str] Union of sentence node and edge subspaces """ return self.sentence_node_subspaces |\ self.sentence_edge_subspaces @property def document_node_subspaces(self) -> set[str]: """The UDS document node subspaces in the corpus. Returns ------- set[str] Set of subspace names for document nodes Raises ------ NotImplementedError This property is not yet implemented """ raise NotImplementedError @property def document_edge_subspaces(self) -> set[str]: """The UDS document edge subspaces in the corpus. Returns ------- set[str] Set of subspace names for document edges """ return self._metadata.document_edge_subspaces # type: ignore[no-any-return,attr-defined] @property def document_subspaces(self) -> set[str]: """All UDS document subspaces (node and edge) in the corpus. Returns ------- set[str] Union of document node and edge subspaces """ return self.document_node_subspaces |\ self.document_edge_subspaces
[docs] def sentence_properties(self, subspace: str | None = None) -> set[str]: """Return the properties in a sentence subspace. Parameters ---------- subspace : str | None, optional Subspace to query, or None for all properties Returns ------- set[str] Property names in the subspace Raises ------ NotImplementedError This method is not yet implemented """ raise NotImplementedError
[docs] def sentence_property_metadata( self, subspace: str, prop: str ) -> UDSPropertyMetadata: """Return the metadata for a property in a sentence subspace. Parameters ---------- subspace The subspace the property is in prop The property in the subspace """ raise NotImplementedError
[docs] def document_properties(self, subspace: str | None = None) -> set[str]: """Return the properties in a document subspace. Parameters ---------- subspace : str | None, optional Subspace to query, or None for all properties Returns ------- set[str] Property names in the subspace Raises ------ NotImplementedError This method is not yet implemented """ raise NotImplementedError
[docs] def document_property_metadata( self, subspace: str, prop: str ) -> UDSPropertyMetadata: """Return the metadata for a property in a document subspace. Parameters ---------- subspace The subspace the property is in prop The property in the subspace """ raise NotImplementedError