import spacy, sklearn_crfsuite
from .base import BasePipeline
from medacy.model.feature_extractor import FeatureExtractor
from ..pipeline_components import GoldAnnotatorComponent, MetaMapComponent, CharacterTokenizer
from ..pipeline_components.lexicon import LexiconComponent
class DrugEventPipeline(BasePipeline):
    """
    Pipeline for recognizing adverse drug events in drug label text.

    On construction it loads SpaCy's ``en_core_web_sm`` model and registers a
    gold-annotation overlay, an optional MetaMap component and an optional
    lexicon component. The learner is a CRF trained with the ``l2sgd``
    algorithm (see :meth:`get_learner`).
    """

    # Sentinel that distinguishes "lexicon argument omitted" (an empty lexicon
    # is used and the LexiconComponent is still registered) from an explicit
    # ``lexicon=None`` (the LexiconComponent is skipped entirely).
    _LEXICON_UNSET = object()

    def __init__(self, metamap=None, entities=None, lexicon=_LEXICON_UNSET):
        """
        Init a pipeline for processing data related to identifying adverse drug events

        :param metamap: instance of MetaMap; when None, no MetaMap component is added
        :param entities: entities to be identified, for this pipeline adverse drug events;
            None (the default) is treated as an empty list
        :param lexicon: Dictionary with labels and their corresponding lexicons to match on;
            when omitted an empty dictionary is used, and passing None explicitly
            disables the lexicon component
        """
        # Create fresh containers per instance instead of relying on mutable
        # default arguments, which would be shared by every pipeline built
        # with the defaults.
        if entities is None:
            entities = []
        if lexicon is self._LEXICON_UNSET:
            lexicon = {}

        description = "Pipeline for recognition of adverse drug events from the 2018/19 FDA OSE drug label challenge"
        super().__init__("drug_event_pipeline",
                         spacy_pipeline=spacy.load("en_core_web_sm"),
                         description=description,
                         creators="Corey Sutphin",
                         organization="NLP@VCU")

        self.entities = entities

        # NOTE: SpaCy's default tokenizer is currently used. To switch to the
        # character-level tokenizer, enable:
        #     self.spacy_pipeline.tokenizer = self.get_tokenizer()

        # Add overlay for gold annotations of the requested entities.
        self.add_component(GoldAnnotatorComponent, entities)

        if metamap is not None:
            # Restrict MetaMap output to the listed semantic type abbreviations.
            self.add_component(
                MetaMapComponent,
                metamap,
                semantic_type_labels=['sosy', 'phpr', 'orga', 'npop', 'mobd',
                                      'inpo', 'comd', 'biof', 'bdsu', 'acab'],
            )

        if lexicon is not None:
            self.add_component(LexiconComponent, lexicon)

    def get_learner(self):
        """
        Build the learner used by this pipeline.

        :return: tuple of the learner name ("CRF_l2sgd") and a freshly
            constructed, unfitted ``sklearn_crfsuite.CRF`` instance
        """
        return ("CRF_l2sgd", sklearn_crfsuite.CRF(
            algorithm='l2sgd',
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        ))

    def get_tokenizer(self):
        """
        Build a character-level tokenizer on top of this pipeline's SpaCy pipeline.

        :return: the ``tokenizer`` attribute of a ``CharacterTokenizer`` created
            from ``self.spacy_pipeline``
        """
        tokenizer = CharacterTokenizer(self.spacy_pipeline)
        return tokenizer.tokenizer