# Source code for decomp.semantics.uds.document

"""Module for representing UDS documents with sentence-level and document-level graphs.

This module provides the UDSDocument class for managing Universal Decompositional Semantics
(UDS) documents. Each document contains:

- A collection of sentence-level graphs (UDSSentenceGraph)
- A document-level graph (UDSDocumentGraph) connecting nodes across sentences
- Metadata including document name, genre, and timestamp
- Methods for adding sentences and annotations to the document

The document structure preserves the hierarchical relationship between documents
and their constituent sentences while enabling document-level semantic annotations.
"""

import re
from functools import cached_property
from typing import cast

from networkx import DiGraph

from .graph import EdgeAttributes, EdgeKey, NodeAttributes, UDSDocumentGraph, UDSSentenceGraph
from .types import BasicNodeAttrs, NetworkXGraphData


# Type aliases used in the UDSDocument method signatures in this module.
# Both are keyed by sentence graph name.
type SentenceGraphDict = dict[str, UDSSentenceGraph]
"""Mapping from graph names to their UDSSentenceGraph objects."""

type SentenceIDDict = dict[str, str]
"""Mapping from graph names to their UD sentence identifiers."""


class UDSDocument:
    """A Universal Decompositional Semantics document.

    A document bundles the sentence-level graphs of its sentences with a
    single document-level graph whose nodes point back at the semantics
    nodes of those sentence graphs.

    Parameters
    ----------
    sentence_graphs
        the UDSSentenceGraphs associated with each sentence in the document
    sentence_ids
        the UD sentence IDs for each graph
    name
        the name of the document (i.e. the UD document ID)
    genre
        the genre of the document (e.g. `weblog`)
    timestamp
        the timestamp of the UD document on which this UDSDocument is based
    doc_graph
        the UDSDocumentGraph for the document. If not provided, this
        will be initialized without edges from sentence_graphs
    """

    def __init__(
        self,
        sentence_graphs: SentenceGraphDict,
        sentence_ids: SentenceIDDict,
        name: str,
        genre: str,
        timestamp: str | None = None,
        doc_graph: UDSDocumentGraph | None = None
    ):
        self.sentence_graphs: SentenceGraphDict = {}
        self.sentence_ids: SentenceIDDict = {}
        self.name = name
        self.genre = genre
        self.timestamp = timestamp

        # Initialize the document-level graph. The check is against None
        # (not truthiness) so that a caller-supplied graph object is always
        # kept, even if that object happens to evaluate as falsy.
        if doc_graph is not None:
            self.document_graph = doc_graph
        else:
            self.document_graph = UDSDocumentGraph(DiGraph(), name)

        # Initialize the sentence-level graphs (this also creates the
        # corresponding document-level nodes).
        self.add_sentence_graphs(sentence_graphs, sentence_ids)

    def to_dict(self) -> NetworkXGraphData:
        """Convert the document graph to a dictionary.

        Only the document-level graph is serialized; the sentence graphs
        are not part of the returned data.

        Returns
        -------
        NetworkXGraphData
            NetworkX adjacency data format for the document graph
        """
        return self.document_graph.to_dict()

    @classmethod
    def from_dict(
        cls,
        document: dict[str, dict],
        sentence_graphs: dict[str, UDSSentenceGraph],
        sentence_ids: dict[str, str],
        name: str = 'UDS'
    ) -> 'UDSDocument':
        """Construct a UDSDocument from a dictionary.

        Since only the document graphs are serialized, the sentence
        graphs must also be provided to this method call in order to
        properly associate them with their documents.

        The genre is taken to be the part of ``name`` before the first
        ``-`` and the timestamp is extracted from ``name`` as well.

        Parameters
        ----------
        document
            a dictionary constructed by networkx.adjacency_data,
            containing the graph for the document
        sentence_graphs
            a dictionary containing (possibly a superset of) the
            sentence-level graphs for the sentences in the document
        sentence_ids
            a dictionary containing (possibly a superset of) the
            UD sentence IDs for each graph
        name
            identifier to append to the beginning of node ids

        Returns
        -------
        UDSDocument
            the document built from ``document`` and the referenced
            sentence graphs

        Raises
        ------
        KeyError
            If a graph name referenced by a document node is missing from
            ``sentence_graphs`` or ``sentence_ids``
        """
        document_graph = cast(UDSDocumentGraph, UDSDocumentGraph.from_dict(document, name))

        # Every document node records which sentence graph it belongs to;
        # the set of those names selects the relevant subset of the inputs.
        sent_graph_names = {node['semantics']['graph'] for node in document['nodes']}
        sent_graphs = {gname: sentence_graphs[gname] for gname in sent_graph_names}
        sent_ids = {gname: sentence_ids[gname] for gname in sent_graph_names}

        genre = name.split('-')[0]
        timestamp = cls._get_timestamp_from_document_name(name)

        # The constructor stamps document_id / sentence_id onto each
        # sentence graph (via add_sentence_graphs), so it is not repeated here.
        return cls(sent_graphs, sent_ids, name, genre, timestamp, document_graph)

    @staticmethod
    def _get_timestamp_from_document_name(document_name: str) -> str | None:
        """Extract timestamp from document name.

        Looks for patterns like 'YYYYMMDD_HHMMSS' or 'YYYYMMDDHHMMSS'
        (eight digits, an optional underscore, six digits) anywhere in the
        document name and returns the first match.

        Parameters
        ----------
        document_name : str
            The document name to parse

        Returns
        -------
        str | None
            The timestamp string if found, None otherwise
        """
        match = re.search(r'\d{8}_?\d{6}', document_name)
        return match.group(0) if match else None

    def add_sentence_graphs(
        self,
        sentence_graphs: SentenceGraphDict,
        sentence_ids: SentenceIDDict
    ) -> None:
        """Add sentence graphs to the document.

        Creates document-level nodes for each semantics node in the
        sentence graphs and updates the sentence graph metadata with
        document information. Any cached value of ``text`` is discarded so
        that it is rebuilt with the added sentences on next access.

        Parameters
        ----------
        sentence_graphs : SentenceGraphDict
            Dictionary mapping graph names to UDSSentenceGraph objects
        sentence_ids : SentenceIDDict
            Dictionary mapping graph names to UD sentence identifiers

        Raises
        ------
        KeyError
            If a graph name in ``sentence_graphs`` has no entry in
            ``sentence_ids``
        """
        # ``text`` is a cached_property, which stores its value in the
        # instance __dict__; drop it so it cannot go stale.
        self.__dict__.pop('text', None)

        for gname, graph in sentence_graphs.items():
            sentence_id = sentence_ids[gname]
            graph.sentence_id = sentence_id
            graph.document_id = self.name
            self.sentence_graphs[gname] = graph
            self.sentence_ids[gname] = sentence_id

            for node_name, node in graph.semantics_nodes.items():
                # The document node ID mirrors the semantics node ID with
                # 'semantics' swapped for 'document'.
                document_node_name = node_name.replace('semantics', 'document')
                self.document_graph.graph.add_node(
                    document_node_name,
                    domain='document',
                    type=node['type'],
                    frompredpatt=False,
                    semantics={'graph': gname, 'node': node_name},
                )

    def add_annotation(
        self,
        node_attrs: dict[str, NodeAttributes],
        edge_attrs: dict[EdgeKey, EdgeAttributes]
    ) -> None:
        """Add annotations to the document-level graph.

        Delegates to the document graph's add_annotation method, passing
        along this document's sentence IDs.

        Parameters
        ----------
        node_attrs : dict[str, NodeAttributes]
            Node annotations keyed by node ID
        edge_attrs : dict[EdgeKey, EdgeAttributes]
            Edge annotations keyed by (source, target) tuples
        """
        self.document_graph.add_annotation(node_attrs, edge_attrs, self.sentence_ids)

    def semantics_node(self, document_node: str) -> dict[str, BasicNodeAttrs]:
        """Get the semantics node corresponding to a document node.

        Document nodes maintain references to their corresponding
        semantics nodes through the 'semantics' attribute, which contains
        the graph name and node ID.

        Parameters
        ----------
        document_node : str
            The document domain node ID

        Returns
        -------
        dict[str, BasicNodeAttrs]
            Single-item dict mapping node ID to its attributes

        Raises
        ------
        TypeError
            If the semantics attribute is not a dictionary
        KeyError
            If required keys are missing from semantics dict
        """
        semantics = self.document_graph.nodes[document_node]['semantics']

        if not isinstance(semantics, dict):
            raise TypeError(f"Expected 'semantics' to be a dict but got {type(semantics)}")

        if 'graph' not in semantics or 'node' not in semantics:
            raise KeyError("Expected 'semantics' dict to have 'graph' and 'node' keys")

        graph_id = cast(str, semantics['graph'])
        node_id = cast(str, semantics['node'])
        semantics_node = self.sentence_graphs[graph_id].semantics_nodes[node_id]

        return {node_id: cast(BasicNodeAttrs, semantics_node)}

    @cached_property
    def text(self) -> str:
        """The full document text reconstructed from sentences.

        Concatenates the text from all sentence graphs, ordered by graph
        name, with space separation. The value is cached after the first
        access; add_sentence_graphs clears the cache.

        Returns
        -------
        str
            The complete document text
        """
        # NOTE(review): graph names are sorted as plain strings, so a name
        # ending in '-10' sorts before one ending in '-9' -- confirm this
        # matches the intended sentence order.
        return ' '.join(
            sent_graph.sentence
            for _, sent_graph in sorted(self.sentence_graphs.items())
        )