# pylint: disable=W0221
# pylint: disable=R0903
# pylint: disable=R1704
"""Corpus management for PredPatt semantic extractions.

This module provides functionality for loading and managing collections of
PredPatt semantic graphs from CoNLL-U format dependency corpora.

Classes
-------
PredPattCorpus
    Container class extending the base Corpus for managing PredPatt semantic
    extractions paired with their dependency graphs.
"""

from collections.abc import Hashable
from os.path import basename, splitext
from typing import TextIO, cast

from networkx import DiGraph

from ...corpus import Corpus
from ...syntax.dependency import CoNLLDependencyTreeCorpus
from .core.options import PredPattOpts
from .extraction.engine import PredPattEngine as PredPatt
from .graph import PredPattGraphBuilder
from .parsing.loader import load_conllu


class PredPattCorpus(Corpus[tuple[PredPatt, DiGraph], DiGraph]):
    """Container for managing collections of PredPatt semantic graphs.

    This class extends the base Corpus class to handle PredPatt extractions
    paired with their dependency graphs. It provides methods for loading
    corpora from CoNLL format and converting them to NetworkX graphs with
    semantic annotations.
    """

    def _graphbuilder(
        self,
        graphid: Hashable,
        predpatt_depgraph: tuple[PredPatt, DiGraph],
    ) -> DiGraph:
        """Build a unified graph from PredPatt extraction and dependency parse.

        Delegates to ``PredPattGraphBuilder.from_predpatt`` so that the
        syntactic information from the dependency graph and the
        predicate-argument structures extracted by PredPatt end up in a
        single NetworkX graph.

        Parameters
        ----------
        graphid : Hashable
            Unique identifier for the graph. It is converted with ``str``
            before being handed to the graph builder.
        predpatt_depgraph : tuple[PredPatt, DiGraph]
            Tuple containing the PredPatt extraction and its source
            dependency graph, in that order.

        Returns
        -------
        DiGraph
            NetworkX graph produced by the graph builder.
        """
        predpatt, depgraph = predpatt_depgraph

        return PredPattGraphBuilder.from_predpatt(
            predpatt, depgraph, str(graphid)
        )

    @classmethod
    def from_conll(
        cls,
        corpus: str | TextIO,
        name: str = 'ewt',
        options: PredPattOpts | None = None,
    ) -> 'PredPattCorpus':
        """Load a CoNLL-U dependency corpus and extract predicate-argument structures.

        Each blank-line-separated block of the input is treated as one
        sentence. A dependency graph corpus is built from the blocks, a
        PredPatt extraction is run on every parse returned by
        ``load_conllu``, and the two are paired by graph identifier.

        Parameters
        ----------
        corpus : str | TextIO
            One of: a path whose basename ends in ``.conllu`` (the file is
            read as UTF-8), a raw CoNLL-U formatted string (any other
            string), or an open file handle (its ``read()`` is used).
        name : str, optional
            Corpus name used as prefix for graph identifiers, which take the
            form ``"<name>-<n>"``. Default is 'ewt'.
        options : PredPattOpts | None, optional
            Configuration options for PredPatt extraction. If None, the
            package-level ``DEFAULT_PREDPATT_OPTIONS`` are used (described
            by the package as enabling relative clause resolution and
            argument borrowing).

        Returns
        -------
        PredPattCorpus
            Corpus mapping each graph identifier to its
            ``(PredPatt, dependency graph)`` pair.

        Raises
        ------
        ValueError
            If PredPatt cannot parse the provided CoNLL-U data, likely due
            to an incompatible Universal Dependencies version.
        """
        # Import here to avoid circular import
        from . import DEFAULT_PREDPATT_OPTIONS

        if options is None:
            options = DEFAULT_PREDPATT_OPTIONS

        if isinstance(corpus, str):
            if splitext(basename(corpus))[1] == '.conllu':
                # CoNLL-U is specified as UTF-8; do not rely on the
                # platform's locale-dependent default encoding.
                with open(corpus, encoding='utf-8') as infile:
                    data = infile.read()
            else:
                # Any other string is taken to be the CoNLL-U content itself.
                data = corpus
        else:
            data = corpus.read()

        # Load the CoNLL dependency parses as graphs. Blocks are numbered
        # from 1 by position; blank lines and '#' comment lines are dropped
        # and every remaining line is split on whitespace into its columns.
        ud_corp_dict = {
            f"{name}-{i + 1}": [
                line.split()
                for line in block.split('\n')
                if line and not line.startswith('#')
            ]
            for i, block in enumerate(data.split('\n\n'))
        }
        # Re-key with Hashable-typed keys to match the corpus constructor.
        ud_corp_hashable = {
            cast(Hashable, k): v for k, v in ud_corp_dict.items()
        }
        ud_corp = CoNLLDependencyTreeCorpus(ud_corp_hashable)

        # Extract the predpatt for those dependency parses. The graph id is
        # rebuilt from the second '_'-separated field of the sentence id
        # yielded by load_conllu.
        try:
            predpatt = {
                f"{name}-{sid.split('_')[1]}": PredPatt(ud_parse, opts=options)
                for sid, ud_parse in load_conllu(data)
            }
        except ValueError:
            errmsg = (
                "PredPatt was unable to parse the CoNLL you provided. "
                "This is likely due to using a version of UD that is "
                "incompatible with PredPatt. Use of version 1.2 is suggested."
            )
            # The original traceback is deliberately suppressed in favour of
            # the actionable message above.
            raise ValueError(errmsg) from None

        return cls({n: (pp, ud_corp[n]) for n, pp in predpatt.items()})