Source code for decomp.semantics.predpatt.parsing.loader

"""Load different sources of data.

This module provides functions to load dependency parses from various formats,
particularly focusing on CoNLL-U format files.
"""

from __future__ import annotations

import codecs
import os
from collections.abc import Iterator
from typing import TYPE_CHECKING


if TYPE_CHECKING:
    from concrete import Sentence, Tokenization

from ..parsing.udparse import DepTriple, UDParse



[docs]
def load_comm(
    filename: str,
    tool: str = 'ud converted ptb trees using pyStanfordDependencies'
) -> Iterator[tuple[str, UDParse]]:
    """Load a concrete communication file with required pyStanfordDependencies output.
    
    .. warning::
       This function is part of a planned parsing feature that is not yet fully supported.
       It requires the ``concrete`` package (available via ``pip install decomp[parsing]``).
       Full parsing functionality with modern UD parsers will be added in a future release.

    Parameters
    ----------
    filename : str
        Path to the concrete communication file.
    tool : str, optional
        The tool name to look for in the dependency parse metadata.

    Yields
    ------
    tuple[str, UDParse]
        Tuples of (section_label, parse) for each sentence.
    
    Raises
    ------
    ImportError
        If the concrete package is not installed.
    """
    try:
        # import here to avoid requiring concrete
        from concrete.util.file_io import read_communication_from_file
    except ImportError as e:
        raise ImportError(
            "The 'concrete' package is required to use load_comm(). "
            "Install it with: pip install concrete"
        ) from e
    
    comm = read_communication_from_file(filename)
    if comm.sectionList:
        for sec in comm.sectionList:
            if sec.sentenceList:
                for sent in sec.sentenceList:
                    yield sec.label, get_udparse(sent, tool)




[docs]
def load_conllu(filename_or_content: str) -> Iterator[tuple[str, UDParse]]:
    """Load CoNLL-U style files (e.g., the Universal Dependencies treebank).

    Parameters
    ----------
    filename_or_content : str
        Either a path to a CoNLL-U file or the content string itself.

    Yields
    ------
    tuple[str, UDParse]
        Tuples of (sentence_id, parse) for each sentence in the file.

    Notes
    -----
    - Sentence IDs default to "sent_N" where N starts at 1
    - Lines starting with "# sent_id" override the sentence ID
    - Other comment lines (starting with #) are used as ID if no sent_id found
    - Multi-token lines (with '-' in first column) are skipped
    - Expects 10 tab-separated columns per data line
    """
    sent_num = 1
    try:
        if os.path.isfile(filename_or_content):
            with codecs.open(filename_or_content, encoding='utf-8') as f:
                content = f.read().strip()
        else:
            content = filename_or_content.strip()
    except ValueError:
        # work around an issue on windows: `os.path.isfile` will call `os.stat`,
        # which throws a ValueError if the "filename" is too long. Possibly
        # a python bug in that this could be caught in os.path.isfile? Though
        # I found some related issues where discussion suggests it was deemed
        # not a bug.
        content = filename_or_content.strip()

    for block in content.split('\n\n'):
        block = block.strip()
        if not block:
            continue
        lines = []
        sent_id = f'sent_{sent_num}'
        has_sent_id = 0
        for line in block.split('\n'):
            if line.startswith('#'):
                if line.startswith('# sent_id'):
                    sent_id = line[10:].strip()
                    has_sent_id = 1
                else:
                    if not has_sent_id:   # don't take subsequent comments as sent_id
                        sent_id = line[1:].strip()
                continue
            parts = line.split('\t') # data appears to use '\t'
            if '-' in parts[0]:      # skip multi-tokens, e.g., on Spanish UD bank
                continue
            assert len(parts) == 10, parts
            lines.append(parts)
        [_, tokens, _, tags, _, _, gov, gov_rel, _, _] = list(zip(*lines, strict=False))
        triples = [
            DepTriple(rel, int(gov)-1, dep)
            for dep, (rel, gov) in enumerate(zip(gov_rel, gov, strict=False))
        ]
        parse = UDParse(list(tokens), list(tags), triples)
        yield sent_id, parse
        sent_num += 1




[docs]
def get_tags(tokenization: 'Tokenization', tagging_type: str = 'POS') -> list[str]:
    """Extract tags of a specific type from a tokenization.
    
    .. note::
       This function requires the ``concrete`` package to be installed.

    Parameters
    ----------
    tokenization : Tokenization
        A Concrete tokenization object.
    tagging_type : str, optional
        The type of tagging to extract (default: 'POS').

    Returns
    -------
    list[str]
        List of tags in token order.
    """
    for token_tagging in tokenization.tokenTaggingList:
        if token_tagging.taggingType == tagging_type:
            idx2pos = {taggedToken.tokenIndex: taggedToken.tag
                       for taggedToken in token_tagging.taggedTokenList}
            return [idx2pos[idx] for idx in sorted(idx2pos.keys())]
    # Return empty list if no matching tagging type found
    return []




[docs]
def get_udparse(sent: 'Sentence', tool: str) -> UDParse:
    """Create a ``UDParse`` from a sentence extracted from a Communication.
    
    .. note::
       This function requires the ``concrete`` package to be installed.

    Parameters
    ----------
    sent : Sentence
        A Concrete Sentence object.
    tool : str
        The tool name to look for in dependency parse metadata.

    Returns
    -------
    UDParse
        The parsed representation of the sentence.
    """
    # extract dependency parse for Communication.
    triples = []
    for ud_parse in sent.tokenization.dependencyParseList:
        if ud_parse.metadata.tool == tool:
            for dependency in ud_parse.dependencyList:
                triples.append(DepTriple(dependency.edgeType,
                                         dependency.gov, dependency.dep))
            break

    # Extract token strings
    tokens = [x.text for x in sent.tokenization.tokenList.tokenList]

    # Extract POS tags
    tags = get_tags(sent.tokenization, 'POS')

    #triples.sort(key=lambda triple: triple.dep)
    parse = UDParse(tokens=tokens, tags=tags, triples=triples)

    # Extract lemmas
    #parse.lemmas = get_tags(sent.tokenization, 'LEMMA')

    return parse