import spacy, sklearn_crfsuite
from .base import BasePipeline
from ..pipeline_components import ClinicalTokenizer
from medacy.model.feature_extractor import FeatureExtractor
from ..pipeline_components import GoldAnnotatorComponent, MetaMapComponent, UnitComponent, MetaMap
class ClinicalPipeline(BasePipeline):
    """
    A pipeline for clinical named entity recognition. A special tokenizer that breaks down a clinical document
    to character level tokens defines this pipeline.
    """

    def __init__(self, metamap=None, entities=None):
        """
        Create a pipeline with the name 'clinical_pipeline' utilizing
        by default spaCy's small english model.

        :param metamap: an instance of MetaMap if metamap should be used, defaults to None.
            A value that is not a MetaMap instance is ignored and no MetaMap component is added.
        :param entities: the entities handed to the GoldAnnotatorComponent and stored on
            ``self.entities``; defaults to a new empty list for each pipeline instance.
        """
        # A fresh list per instance: a literal ``[]`` default would be one shared
        # object across every pipeline created without an explicit ``entities``.
        if entities is None:
            entities = []

        description = """Pipeline tuned for the extraction of ADE related entities from the 2018 N2C2 Shared Task"""
        super().__init__(
            "clinical_pipeline",
            spacy_pipeline=spacy.load("en_core_web_sm"),
            description=description,
            creators="Andriy Mulyar (andriymulyar.com)",  # append if multiple creators
            organization="NLP@VCU",
        )

        self.entities = entities

        # Replace the loaded spaCy model's tokenizer with the clinical one.
        # NOTE(review): relies on BasePipeline.__init__ having set self.spacy_pipeline - confirm.
        self.spacy_pipeline.tokenizer = self.get_tokenizer()

        # Add overlay for gold annotations.
        self.add_component(GoldAnnotatorComponent, entities)

        # isinstance() is already False for None, so no separate None check is needed.
        if isinstance(metamap, MetaMap):
            self.add_component(MetaMapComponent, metamap)

        self.add_component(UnitComponent)

    def get_learner(self):
        """
        Build the learner used by this pipeline.

        :return: a tuple of the learner name ``"CRF_l2sgd"`` and a newly constructed
            ``sklearn_crfsuite.CRF`` configured with the 'l2sgd' algorithm.
        """
        return ("CRF_l2sgd", sklearn_crfsuite.CRF(
            algorithm='l2sgd',
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        ))

    def get_tokenizer(self):
        """
        Build the tokenizer used by this pipeline.

        :return: the ``tokenizer`` attribute of a ClinicalTokenizer constructed
            from this pipeline's spaCy pipeline.
        """
        tokenizer = ClinicalTokenizer(self.spacy_pipeline)
        return tokenizer.tokenizer