Source code for decomp.semantics.predpatt.core.token

"""Token representation for dependency parsing in PredPatt.

This module defines the core Token class that represents individual
tokens (words) in a dependency parse tree. Tokens store linguistic information
including text, part-of-speech tags, and dependency relations.

Classes
-------
Token
    Represents a single token with its linguistic properties and dependency
    relations. Used as the basic unit in dependency parsing for predicate-argument
    extraction.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

from ..utils.ud_schema import dep_v1, postag


if TYPE_CHECKING:
    from ..parsing.udparse import DepTriple
    from ..typing import UDSchema


class Token:
    """A single word (or punctuation mark) of a dependency-parsed sentence.

    Attributes
    ----------
    position : int
        Index of the token within its sentence (0-based).
    text : str
        Surface string of the token.
    tag : str
        Part-of-speech tag assigned to the token.
    dependents : list[DepTriple] | None
        Edges for which this token is the governor. Starts out as None and
        is expected to be filled in once dependency information is attached.
    gov : Token | None
        The governing (parent) token in the dependency tree. Starts as None.
    gov_rel : str | None
        Relation label linking this token to its governor. Starts as None.
    ud : UDSchema
        Universal Dependencies module (dep_v1 or dep_v2) supplying the
        relation constants consulted by the predicate methods below.
    """

    def __init__(self, position: int, text: str, tag: str, ud: UDSchema = dep_v1) -> None:
        """Create a token that is not yet attached to a dependency tree.

        Parameters
        ----------
        position : int
            Index of the token within its sentence (0-based).
        text : str
            Surface string of the token.
        tag : str
            Part-of-speech tag assigned to the token.
        ud : UDSchema, optional
            Universal Dependencies module to use, by default dep_v1.
        """
        # Attribute creation order is kept exactly as it has always been:
        # first the caller-supplied fields ...
        self.position: int = position
        self.text: str = text
        self.tag: str = tag
        # ... then the tree links, which stay None until the parse is wired up ...
        self.dependents: list[DepTriple] | None = None
        self.gov: Token | None = None
        self.gov_rel: str | None = None
        # ... and finally the relation schema.
        self.ud: UDSchema = ud

    def __repr__(self) -> str:
        """Render the token as ``text/position``.

        Returns
        -------
        str
            The token text and its position joined by a slash.
        """
        return f'{self.text}/{self.position}'

    @property
    def isword(self) -> bool:
        """Tell whether the token is something other than punctuation.

        Returns
        -------
        bool
            False when the tag equals ``postag.PUNCT``, True otherwise.
        """
        punct_tag = postag.PUNCT
        return self.tag != punct_tag

    def argument_like(self) -> bool:
        """Tell whether this token looks like the root of an argument.

        Returns
        -------
        bool
            True when ``gov_rel`` is one of the schema's ARG_LIKE relations.
        """
        arg_like_relations = self.ud.ARG_LIKE
        return self.gov_rel in arg_like_relations

    def hard_to_find_arguments(self) -> bool:
        """Tell whether this may be a predicate root with hard-to-find arguments.

        This is only called when one of the token's dependents is an easy
        predicate. The question being answered is: could this token itself be
        the root of an easy predicate that will have an argument?

        Returns
        -------
        bool
            False as soon as any dependent edge carries a relation listed in
            the schema's SUBJ or OBJ; otherwise True exactly when ``gov_rel``
            is one of the schema's HARD_TO_FIND_ARGS relations.

        Raises
        ------
        TypeError
            If ``dependents`` is still None.
        """
        # Example (amod):
        #   "there is nothing wrong with a negotiation, but nothing helpful
        #    about generating one that is just for show ."
        # The sentence contains an amod edge, and an easy predicate that is a
        # dependent of "helpful"; "helpful" is the hard_to_find_arguments token.
        outgoing = self.dependents
        if outgoing is None:
            raise TypeError(
                f"Cannot iterate over None dependents for token '{self.text}' "
                f"at position {self.position}. Token not properly initialized "
                f"with dependency information."
            )

        schema = self.ud
        # A subject or object among the dependents rules this token out.
        if any(edge.rel in schema.SUBJ or edge.rel in schema.OBJ for edge in outgoing):
            return False
        return self.gov_rel in schema.HARD_TO_FIND_ARGS