# Source code for medacy.pipeline_components.annotation.gold_annotator_component

from ..base import BaseComponent
from medacy.tools import Annotations
from spacy.tokens import Token
import logging

class GoldAnnotatorComponent(BaseComponent):
    """
    A pipeline component that overlays gold annotations. This pipeline component
    sets the attribute 'gold_label' to all tokens to be used as the class value of the token
    when fed into a supervised learning algorithm.

    Tokens that are not covered by any gold annotation keep the default label "O".

    The instance keeps two running counters across every document it processes:
    ``failed_overlay_count`` (annotations whose character offsets did not line up with
    token boundaries) and ``failed_identifying_span_count`` (those that also could not
    be repaired by :meth:`find_span`).
    """
    # TODO CLEAN ME
    # TODO Look into spacy GoldParse
    # TODO This code really needs a fixing but it is bootstrapped to work from development

    name = "gold_annotator"
    dependencies = []

    def __init__(self, spacy_pipeline, labels):
        """
        :param spacy_pipeline: An existing spacy Language processing pipeline
        :param labels: The subset of labels from the gold annotations to restrict labeling to.
            An empty collection (or None) means every label is overlaid.
        """
        self.nlp = spacy_pipeline
        self.labels = labels
        self.failed_overlay_count = 0
        self.failed_identifying_span_count = 0
        # force=True so that re-creating the component does not fail on an
        # already registered extension.
        Token.set_extension('gold_label', default="O", force=True)

    def find_span(self, start, end, label, span, doc):
        """
        Greedily searches characters around an annotation to find a valid set of tokens
        the annotation likely corresponds to.

        The exact offsets are tried first, then the offsets with one trailing character
        dropped, then the boundaries are widened one character at a time (alternating
        between the end and the start) for at most 21 attempts.

        :param start: character offset where the annotation starts
        :param end: character offset where the annotation ends
        :param label: label of the annotation (unused, kept for interface compatibility)
        :param span: span previously found for the annotation (unused, kept for interface compatibility)
        :param doc: the spacy Doc the annotation is overlaid on
        :return: the matching Span, or None when no token-aligned span could be found
        """
        candidate = doc.char_span(start, end)
        if candidate is not None:
            return candidate

        # annotation may have extended over an ending blank space
        candidate = doc.char_span(start, end - 1)
        if candidate is not None:
            return candidate

        # No clue - increase boundaries incrementally until a valid span is found.
        # Boundaries are clamped to the document: an offset outside the text can never
        # line up with a token, so letting start drop below zero would doom every
        # remaining attempt for annotations close to the beginning of the document.
        text_length = len(doc.text)
        for attempt in range(21):
            if attempt % 2 == 0:
                end = min(end + 1, text_length)
            else:
                start = max(start - 1, 0)
            candidate = doc.char_span(start, end)
            if candidate is not None:
                return candidate

        return None

    def __call__(self, doc):
        """
        Overlays the gold annotations referenced by ``doc._.gold_annotation_file`` onto
        the tokens of ``doc`` by setting their 'gold_label' extension.

        :param doc: the spacy Doc to annotate
        :return: the same Doc, with gold labels set on the annotated tokens
        :raises ValueError: if the extension doc._.gold_annotation_file is not present
        """
        # The file name is only used in log messages; the extension is optional, so
        # fall back to None instead of failing inside a logging call.
        file_name = getattr(doc._, 'file_name', None)

        if hasattr(doc._, 'file_name'):
            logging.debug("%s: Called GoldAnnotator Component", file_name)

        # print document tokenization (isEnabledFor also covers levels below DEBUG)
        if logging.getLogger().isEnabledFor(logging.DEBUG):
            for token in doc:
                logging.debug(str(token))

        # check if gold annotation file path has been set.
        if not hasattr(doc._, 'gold_annotation_file'):
            raise ValueError("No extension doc._.gold_annotation_file is present.")

        gold_annotations = Annotations(doc._.gold_annotation_file, annotation_type='ann')

        # Failures for this document only; the instance counters are cumulative over
        # all processed documents and must not be compared to a single document's size.
        doc_failed_overlays = 0

        for e_label, e_start, e_end, _ in gold_annotations.get_entity_annotations():
            if e_start > e_end:
                logging.critical("%s: Broken annotation - start is greater than end: (%i,%i,%s)", file_name, e_start, e_end, e_label)
                continue

            span = doc.char_span(e_start, e_end)

            if span is None:
                self.failed_overlay_count += 1
                self.failed_identifying_span_count += 1
                doc_failed_overlays += 1
                logging.warning("%s: Number of failed annotation overlays with current tokenizer: %i (%i,%i,%s)", file_name, self.failed_overlay_count, e_start, e_end, e_label)
                fixed_span = self.find_span(e_start, e_end, e_label, span, doc)
            else:
                # Offsets already line up with token boundaries, nothing to repair.
                fixed_span = span

            if fixed_span is None:
                # annotation was not able to be fixed, it will be ignored - this is bad in evaluation.
                logging.warning("%s: Could not fix annotation: (%i,%i,%s)", file_name, e_start, e_end, e_label)
                logging.warning("%s: Total Failed Annotations: %i", file_name, self.failed_identifying_span_count)
                continue

            if span is None:
                logging.warning("%s: Fixed span (%i,%i,%s) into: %s", file_name, e_start, e_end, e_label, fixed_span.text)
                self.failed_identifying_span_count -= 1

            # An empty (or missing) label restriction means every label is overlaid.
            if not self.labels or e_label in self.labels:
                for token in fixed_span:
                    token._.set('gold_label', e_label)

        if doc_failed_overlays > .3 * gold_annotations.get_entity_count():
            logging.warning("%s: Annotations may mis-aligned as more than 30 percent failed to overlay: %s", file_name, doc._.gold_annotation_file)

        return doc