"""Load different sources of data.
This module provides functions to load dependency parses from various formats,
particularly focusing on CoNLL-U format files.
"""
from __future__ import annotations
import codecs
import os
from collections.abc import Iterator
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from concrete import Sentence, Tokenization
from ..parsing.udparse import DepTriple, UDParse
[docs]
def load_comm(
    filename: str,
    tool: str = 'ud converted ptb trees using pyStanfordDependencies'
) -> Iterator[tuple[str, UDParse]]:
    """Iterate over the sentences of a concrete communication file as UD parses.

    .. warning::
        This function is part of a planned parsing feature that is not yet
        fully supported. It requires the ``concrete`` package (available via
        ``pip install decomp[parsing]``). Full parsing functionality with
        modern UD parsers will be added in a future release.

    Parameters
    ----------
    filename : str
        Path to the concrete communication file.
    tool : str, optional
        Tool name used to select the dependency parse of each sentence;
        it is passed through unchanged to ``get_udparse``.

    Yields
    ------
    tuple[str, UDParse]
        ``(section_label, parse)`` for every sentence of every section.
        Sections or sentence lists that are missing or empty yield nothing.

    Raises
    ------
    ImportError
        If the ``concrete`` package is not installed. Because this is a
        generator, the error surfaces when iteration starts.
    """
    # The import is deferred to call time so that merely importing this
    # module does not require ``concrete`` to be installed.
    try:
        from concrete.util.file_io import read_communication_from_file
    except ImportError as import_error:
        message = (
            "The 'concrete' package is required to use load_comm(). "
            "Install it with: pip install concrete"
        )
        raise ImportError(message) from import_error

    communication = read_communication_from_file(filename)

    # ``or ()`` lets a falsy section/sentence list behave like an empty one.
    for section in communication.sectionList or ():
        for sentence in section.sentenceList or ():
            yield section.label, get_udparse(sentence, tool)
[docs]
def _read_conllu_content(filename_or_content: str) -> str:
    """Return the CoNLL-U text for a path or an inline content string.

    The result has CRLF / CR line endings normalized to LF and surrounding
    whitespace stripped.
    """
    try:
        is_file = os.path.isfile(filename_or_content)
    except ValueError:
        # Workaround kept from the original code: on Windows,
        # ``os.path.isfile`` calls ``os.stat``, which may raise ValueError
        # when the "filename" is too long. Such a string is treated as
        # inline content.
        is_file = False

    if is_file:
        # Deliberately outside the ``try`` above: UnicodeDecodeError is a
        # subclass of ValueError, and a badly encoded file must raise rather
        # than silently fall back to parsing the path string as content.
        with open(filename_or_content, encoding='utf-8') as f:
            content = f.read()
    else:
        content = filename_or_content

    # Sentences are separated on '\n\n' below, so Windows / old-Mac line
    # endings have to be normalized first.
    return content.replace('\r\n', '\n').replace('\r', '\n').strip()


def load_conllu(filename_or_content: str) -> Iterator[tuple[str, UDParse]]:
    """Load CoNLL-U style files (e.g., the Universal Dependencies treebank).

    Parameters
    ----------
    filename_or_content : str
        Either a path to a CoNLL-U file or the content string itself.
        An existing file path is read as UTF-8; anything else is parsed
        as content.

    Yields
    ------
    tuple[str, UDParse]
        Tuples of (sentence_id, parse) for each sentence in the input.

    Raises
    ------
    UnicodeDecodeError
        If ``filename_or_content`` is a file that is not valid UTF-8.
    AssertionError
        If a data line does not have exactly 10 tab-separated columns.

    Notes
    -----
    - Sentences are blocks separated by a blank line; LF, CRLF and CR line
      endings are all accepted
    - Sentence IDs default to "sent_N" where N starts at 1 and counts only
      the sentences actually yielded
    - A line starting with "# sent_id" sets the ID to the text after the
      first 10 characters of that line, stripped. For the common
      ``# sent_id = x`` form this gives ``= x``; the slicing is kept as-is
      for backward compatibility
    - Other comment lines (starting with #) are used as ID until a
      "# sent_id" line has been seen in the block
    - Multi-token lines (with '-' in first column) are skipped
    - Blocks that contain no token lines (e.g. only comments) are skipped
    - Head indices are converted from 1-based to 0-based (root becomes -1)
    """
    content = _read_conllu_content(filename_or_content)

    sent_num = 1
    for block in content.split('\n\n'):
        block = block.strip()
        if not block:
            continue

        rows: list[list[str]] = []
        sent_id = f'sent_{sent_num}'
        has_sent_id = False

        for line in block.split('\n'):
            if line.startswith('#'):
                if line.startswith('# sent_id'):
                    sent_id = line[10:].strip()
                    has_sent_id = True
                elif not has_sent_id:  # don't take subsequent comments as sent_id
                    sent_id = line[1:].strip()
                continue

            parts = line.split('\t')
            if '-' in parts[0]:  # skip multi-tokens, e.g., on Spanish UD bank
                continue
            assert len(parts) == 10, parts
            rows.append(parts)

        if not rows:
            # Nothing but comments / multi-token lines: there is no sentence
            # to build, and unpacking the empty transpose below would fail.
            continue

        # Transpose rows into columns; only FORM, UPOS, HEAD and DEPREL
        # (columns 2, 4, 7 and 8) are used.
        _, tokens, _, tags, _, _, gov, gov_rel, _, _ = zip(*rows, strict=False)
        triples = [
            DepTriple(rel, int(head) - 1, dep)
            for dep, (rel, head) in enumerate(zip(gov_rel, gov, strict=False))
        ]

        yield sent_id, UDParse(list(tokens), list(tags), triples)
        sent_num += 1
[docs]
def get_udparse(sent: 'Sentence', tool: str) -> UDParse:
    """Build a ``UDParse`` from one sentence of a Communication.

    .. note::
        This function requires the ``concrete`` package to be installed.

    Parameters
    ----------
    sent : Sentence
        A Concrete Sentence object.
    tool : str
        Tool name compared against ``metadata.tool`` of each dependency
        parse attached to the sentence's tokenization.

    Returns
    -------
    UDParse
        Parse built from the sentence's token texts, its 'POS' tags and the
        dependencies of the first parse produced by ``tool``. If no parse
        matches, the triples list is empty.
    """
    # Only the first dependency parse whose tool matches is used.
    matching_parse = next(
        (
            candidate
            for candidate in sent.tokenization.dependencyParseList
            if candidate.metadata.tool == tool
        ),
        None,
    )

    # Triples keep the order in which the dependency list provides them;
    # no sorting is applied.
    triples = []
    if matching_parse is not None:
        triples = [
            DepTriple(edge.edgeType, edge.gov, edge.dep)
            for edge in matching_parse.dependencyList
        ]

    # Surface strings of the tokens.
    tokens = [token.text for token in sent.tokenization.tokenList.tokenList]

    # Part-of-speech tags; lemmas are not attached to the parse.
    tags = get_tags(sent.tokenization, 'POS')

    return UDParse(tokens=tokens, tags=tags, triples=triples)