Source code for medacy.pipeline_components.metamap.metamap_component
from medacy.pipeline_components.metamap.metamap import MetaMap
from spacy.tokens import Token
from ..base import BaseComponent
import warnings,logging
[docs]class MetaMapComponent(BaseComponent):
"""
A pipeline component for SpaCy that overlays Metamap output as token attributes
"""
name = "metamap_annotator"
dependencies = []
def __init__(self, spacy_pipeline, metamap, cuis=True, semantic_type_labels = ['orch', 'phsu'], merge_tokens=False):
"""
Initializes a pipeline component that annotates MetaMap output onto a spacy doc object.
:param spacy_pipeline: an instance of a spacy language pipeline.
:param metamap: an instance of MetaMap.
:param cuis: Overlay CUIS from metamap output - one feature taking on multiple categorical values representing cuis.
:param semantic_type_labels: Semantic type labels to check for- generates a feature for each semantic type label.
"""
super().__init__(self.name, self.dependencies)
self.nlp = spacy_pipeline
assert isinstance(metamap, MetaMap), "MetamapComponent requires a MetaMap instance as an argument."
self.metamap = metamap
self.cuis = cuis
self.semantic_type_labels = semantic_type_labels
self.merge_tokens = merge_tokens
def __call__(self, doc):
"""
Runs a document to the metamap_annotator pipeline component. This overlays rich medical features by utilizing
MetaMap output and aligning it with a passed spacy Doc object. By medaCy conventions, each overlayed feature
is available as a token extension starting with 'feature_'. This component overlays 'feature_cui' and a
separate boolean feature for each semantic type to detect available under 'feature_is_{type}". This component
was originally designed to increase recall on Drug entities hence by default 'feature_is_orch' and
'feature_is_phsu' where orch and phsu are semantic types corresponding to organic chemicals and pharmalogical
substances respectively.
:param doc: document to run through pipeline
:return:
"""
logging.debug("Called MetaMap Component")
metamap = self.metamap
nlp = self.nlp
semantic_type_labels = self.semantic_type_labels
#register all extensions
if self.cuis:
Token.set_extension('feature_cui', default="-1", force=True) #cui feature
for semantic_type_label in semantic_type_labels: #is_semantic type features
Token.set_extension('feature_is_' + semantic_type_label, default=False, force=True)
#check if pre-metamapped file has been assigned to the document
if hasattr(doc._, 'metamapped_file'):
metamap_dict = metamap.load(doc._.metamapped_file)
else:
if hasattr(doc._, 'file_name'):
logging.debug("%s: Could not find metamap file for document." % doc._.file_name)
metamap_dict = metamap.map_text(doc.text) #TODO metamap.map_text is broken currently
if not hasattr(doc._, 'file_name'): #TODO REMOVE when implemnting live model prediction
return doc
# TODO refactor second part of if statement when implementing live model prediction
if metamap_dict == '' or metamap_dict['metamap'] is None:
if hasattr(doc._, 'metamapped_file'):
warnings.warn("%s: This metamap file is invalid and cannot be parsed in MetaMapComponent: %s \n Ignore this warning if this is a unittest - all may be fine." % (doc._.file_name,doc._.metamapped_file))
else:
warnings.warn("Metamapping text on the fly failed - aborting. Try to pre-metamap with DataLoader.")
return doc
mapped_terms = metamap.extract_mapped_terms(metamap_dict) #parse terms out of mappings dictionary
spans = [] #for displaying NER output with displacy
#Overlays semantic type presence if the given semantic type is set in metamap span.
for semantic_type_label in semantic_type_labels:
entity_name = semantic_type_label
nlp.entity.add_label(entity_name) #register entity label
entity_tags = metamap.get_term_by_semantic_type(mapped_terms, include=[semantic_type_label])
entity_annotations = metamap.mapped_terms_to_spacy_ann(entity_tags, semantic_type_label)
with doc.retokenize() as retokenizer:
for start, end, label in [entity_annotations['entities'][key] for key in entity_annotations['entities'].keys()]:
span = doc.char_span(start, end, label=nlp.vocab.strings[entity_name])
#TODO spans are none when indices and token boundaries don't line up.
if span not in spans:
if span is not None:
logging.debug("Found from metamap: (label=%s,raw_text=\"%s\",location=(%i, %i))" % (label,span.text, start, end ) )
spans.append(span)
for token in span:
token._.set('feature_is_' + label, True)
if self.merge_tokens:
try:
retokenizer.merge(span)
except BaseException:
continue
else:
logging.debug("Metamap span could not be overlayed due to tokenization mis-match: (%i, %i)" % (start, end))
#adds labels for displaying NER output with displacy.
# for span in spans:
# try:
# doc.ents = list(doc.ents) + [span]
# except ValueError as error:
# logging.warning(str(error)) #This gets called when the same token may match multiple semantic types
#Overlays CUI of each term
if Token.has_extension('feature_cui'):
with doc.retokenize() as retokenizer:
for term in mapped_terms:
cui = term['CandidateCUI']
start, end = metamap.get_span_by_term(term)[0]
span = doc.char_span(start, end)
if span is not None:
for token in span:
token._.set('feature_cui', cui)
if self.merge_tokens:
try:
retokenizer.merge(span)
except BaseException:
continue
return doc