Source code for decomp.semantics.uds.document

"""Module for representing UDS documents with sentence-level and document-level graphs.

This module provides the UDSDocument class for managing Universal Decompositional Semantics
(UDS) documents. Each document contains:

- A collection of sentence-level graphs (UDSSentenceGraph)
- A document-level graph (UDSDocumentGraph) connecting nodes across sentences
- Metadata including document name, genre, and timestamp
- Methods for adding sentences and annotations to the document

The document structure preserves the hierarchical relationship between documents
and their constituent sentences while enabling document-level semantic annotations.
"""

import re
from functools import cached_property
from typing import cast

from networkx import DiGraph

from .graph import EdgeAttributes, EdgeKey, NodeAttributes, UDSDocumentGraph, UDSSentenceGraph
from .types import BasicNodeAttrs, NetworkXGraphData


# type aliases
type SentenceGraphDict = dict[str, UDSSentenceGraph]
"""Mapping from graph names to their UDSSentenceGraph objects."""

type SentenceIDDict = dict[str, str]
"""Mapping from graph names to their UD sentence identifiers."""



[docs]
class UDSDocument:
    """A Universal Decompositional Semantics document.

    Parameters
    ----------
    sentence_graphs
        the UDSSentenceGraphs associated with each sentence in the document
    sentence_ids
        the UD sentence IDs for each graph
    name
        the name of the document (i.e. the UD document ID)
    genre
        the genre of the document (e.g. `weblog`)
    timestamp
        the timestamp of the UD document on which this UDSDocument is based
    doc_graph
        the NetworkX DiGraph for the document. If not provided, this will be
        initialized without edges from sentence_graphs
    """


[docs]
    def __init__(
        self,
        sentence_graphs: SentenceGraphDict,
        sentence_ids: SentenceIDDict,
        name: str,
        genre: str,
        timestamp: str | None = None,
        doc_graph: UDSDocumentGraph | None = None
    ):
        self.sentence_graphs: SentenceGraphDict = {}
        self.sentence_ids: SentenceIDDict = {}
        self.name = name
        self.genre = genre
        self.timestamp = timestamp

        # Initialize the document-level graph
        if doc_graph:
            self.document_graph = doc_graph
        else:
            self.document_graph = UDSDocumentGraph(DiGraph(), name)

        # Initialize the sentence-level graphs
        self.add_sentence_graphs(sentence_graphs, sentence_ids)



[docs]
    def to_dict(self) -> NetworkXGraphData:
        """Convert the document graph to a dictionary.

        Returns
        -------
        NetworkXGraphData
            NetworkX adjacency data format for the document graph
        """
        return self.document_graph.to_dict()



[docs]
    @classmethod
    def from_dict(
        cls,
        document: dict[str, dict],
        sentence_graphs: dict[str, UDSSentenceGraph],
        sentence_ids: dict[str, str],
        name: str = 'UDS'
    ) -> 'UDSDocument':
        """Construct a UDSDocument from a dictionary.

        Since only the document graphs are serialized, the sentence
        graphs must also be provided to this method call in order
        to properly associate them with their documents.

        Parameters
        ----------
        document
            a dictionary constructed by networkx.adjacency_data,
            containing the graph for the document
        sentence_graphs
            a dictionary containing (possibly a superset of) the
            sentence-level graphs for the sentences in the document
        sentence_ids
            a dictionary containing (possibly a superset of) the
            UD sentence IDs for each graph
        name
            identifier to append to the beginning of node ids
        """
        document_graph = cast(UDSDocumentGraph, UDSDocumentGraph.from_dict(document, name))
        sent_graph_names = set(map(lambda node: node['semantics']['graph'], document['nodes']))
        sent_graphs = {}
        sent_ids = {}
        for gname in sent_graph_names:
            sentence_graphs[gname].document_id = name
            sentence_graphs[gname].sentence_id = sentence_ids[gname]
            sent_graphs[gname] = sentence_graphs[gname]
            sent_ids[gname] = sentence_ids[gname]
        genre = name.split('-')[0]
        timestamp = cls._get_timestamp_from_document_name(name)
        return cls(sent_graphs, sent_ids, name, genre, timestamp, document_graph)


    @staticmethod
    def _get_timestamp_from_document_name(document_name: str) -> str | None:
        """Extract timestamp from document name.

        Looks for patterns like 'YYYYMMDD_HHMMSS' or 'YYYYMMDDHHMMSS'
        in the document name.

        Parameters
        ----------
        document_name : str
            The document name to parse

        Returns
        -------
        str | None
            The timestamp string if found, None otherwise
        """
        timestamp = re.search(r'\d{8}_?\d{6}', document_name)

        return timestamp[0] if timestamp else None


[docs]
    def add_sentence_graphs(
        self,
        sentence_graphs: SentenceGraphDict,
        sentence_ids: SentenceIDDict
    ) -> None:
        """Add sentence graphs to the document.

        Creates document-level nodes for each semantics node in the sentence
        graphs and updates the sentence graph metadata with document information.

        Parameters
        ----------
        sentence_graphs : SentenceGraphDict
            Dictionary mapping graph names to UDSSentenceGraph objects
        sentence_ids : SentenceIDDict
            Dictionary mapping graph names to UD sentence identifiers
        """
        for gname, graph in sentence_graphs.items():
            sentence_graphs[gname].sentence_id = sentence_ids[gname]
            sentence_graphs[gname].document_id = self.name

            self.sentence_graphs[gname] = graph
            self.sentence_ids[gname] = sentence_ids[gname]

            for node_name, node in graph.semantics_nodes.items():
                semantics = {'graph': gname, 'node': node_name}
                document_node_name = node_name.replace('semantics', 'document')
                self.document_graph.graph.add_node(
                    document_node_name,
                    domain='document', type=node['type'],
                    frompredpatt=False, semantics=semantics
                )



[docs]
    def add_annotation(
        self,
        node_attrs: dict[str, NodeAttributes],
        edge_attrs: dict[EdgeKey, EdgeAttributes]
    ) -> None:
        """Add annotations to the document-level graph.

        Delegates to the document graph's add_annotation method, passing
        along the sentence IDs for validation.

        Parameters
        ----------
        node_attrs : dict[str, NodeAttributes]
            Node annotations keyed by node ID
        edge_attrs : dict[EdgeKey, EdgeAttributes]
            Edge annotations keyed by (source, target) tuples
        """
        self.document_graph.add_annotation(node_attrs, edge_attrs, self.sentence_ids)



[docs]
    def semantics_node(self, document_node: str) -> dict[str, BasicNodeAttrs]:
        """Get the semantics node corresponding to a document node.

        Document nodes maintain references to their corresponding semantics
        nodes through the 'semantics' attribute, which contains the graph
        name and node ID.

        Parameters
        ----------
        document_node : str
            The document domain node ID

        Returns
        -------
        dict[str, BasicNodeAttrs]
            Single-item dict mapping node ID to its attributes

        Raises
        ------
        TypeError
            If the semantics attribute is not a dictionary
        KeyError
            If required keys are missing from semantics dict
        """
        semantics = self.document_graph.nodes[document_node]['semantics']
        if not isinstance(semantics, dict):
            raise TypeError(f"Expected 'semantics' to be a dict but got {type(semantics)}")
        if 'graph' not in semantics or 'node' not in semantics:
            raise KeyError("Expected 'semantics' dict to have 'graph' and 'node' keys")
        graph_id = cast(str, semantics['graph'])
        node_id = cast(str, semantics['node'])
        semantics_node = self.sentence_graphs[graph_id].semantics_nodes[node_id]
        return {node_id: cast(BasicNodeAttrs, semantics_node)}


    @cached_property
    def text(self) -> str:
        """The full document text reconstructed from sentences.

        Concatenates the text from all sentence graphs in sorted order
        with space separation.

        Returns
        -------
        str
            The complete document text
        """
        return ' '.join([
            sent_graph.sentence
            for gname, sent_graph in sorted(self.sentence_graphs.items())
        ])