diff --git a/applications/bc5cdr/chemicals.py b/applications/bc5cdr/chemicals.py index 1cc805c..7f957d9 100644 --- a/applications/bc5cdr/chemicals.py +++ b/applications/bc5cdr/chemicals.py @@ -1,9 +1,3 @@ -import re -import functools -import itertools -import collections -import pandas as pd - import collections from trove.labelers.tools import * from trove.labelers.labeling import * @@ -213,7 +207,7 @@ def lfs(self, train_sentences, top_k=10, active_tiers=None): target_concepts = [sty for sty in self.class_map if self.class_map[sty] == 1] specialist_1 = load_specialist_abbrvs(fpath, - umls,3 + umls, target_concepts=target_concepts, filter_ambiguous=True) target_concepts = [sty for sty in self.class_map if @@ -451,7 +445,7 @@ def lfs(self, train_sentences, top_k=10, active_tiers=None): r'''((alpha|beta|gamma)[-][T])''', re.compile(r'''(PG[-]9|U[-]II)'''), re.compile(r'''(BPO|GSH|DFU|CsA|Srl|HOE|GVG|PAN|NMDA)'''), - re.compile(r'''(TCR|MZ|HBsAg|AraG|LR132|SSRI[s]*|HBeAg|LR132|BD10[0-9]{2}|GNC92H2|SSR103800|CGRP)'''), + re.compile(r'''(TCR|MZ|HBsAg|AraG|LR132|SSRI[s]*|HBeAg|BD10[0-9]{2}|GNC92H2|SSR103800|CGRP)'''), # peptides and proteins with less than 15 amino acids ARE annotated r'''(angiotensin([- ]ii)*)''', r'''(u[- ]ii|urotensin[- ]ii)''', @@ -493,7 +487,8 @@ def lfs(self, train_sentences, top_k=10, active_tiers=None): # ---------------------------------------------------------------------- # Hyphen token # ---------------------------------------------------------------------- - def get_subtokens(dictionary, split_chars=['-'], min_occur=20): + def get_subtokens(dictionary, split_chars=None, min_occur=20): + split_chars = ['-'] if not split_chars else split_chars freq = collections.Counter() for term in dictionary: for ch in split_chars: diff --git a/preprocessing/pipes/tokenizers.py b/preprocessing/pipes/tokenizers.py index b4afaae..817387c 100644 --- a/preprocessing/pipes/tokenizers.py +++ b/preprocessing/pipes/tokenizers.py @@ -313,7 +313,7 @@ def ct_tokenizer(nlp): :param nlp: :return: """ - prefix_re = re.compile(r'''^([\["'()*+-?/\<\>#%]+|[><][=])+''') + prefix_re = re.compile(r'''^([\["'()*+-?/<>#%]+|[><][=])+''') suffix_re = re.compile(r'''([\]"'),-.:;*]|'s)$''') infix_re = re.compile(r'''[%(),-./;=?]+''') # spaCy SBD break w/o [.] diff --git a/trove/contrib/labelers/clinical/family.py b/trove/contrib/labelers/clinical/family.py index 4ad868a..a1f4d8a 100644 --- a/trove/contrib/labelers/clinical/family.py +++ b/trove/contrib/labelers/clinical/family.py @@ -17,7 +17,7 @@ OTHER = 2 ABSTAIN = 0 -rgx_relatives = re.compile(r'''\b(((grand)*(mother|father)|grand(m|p)a)([']*s)*|((parent|(daught|sist|broth)er|son|cousin)([']*s)*))\b''', re.I) +rgx_relatives = re.compile(r'''\b(((grand)*(mother|father)|grand([mp])a)([']*s)*|((parent|(daught|sist|broth)er|son|cousin)([']*s)*))\b''', re.I) def LF_relative(span): diff --git a/trove/contrib/labelers/clinical/taggers.py b/trove/contrib/labelers/clinical/taggers.py index f6b090a..107cc69 100644 --- a/trove/contrib/labelers/clinical/taggers.py +++ b/trove/contrib/labelers/clinical/taggers.py @@ -1,7 +1,7 @@ import re from itertools import product -from inkfish.data.dataloaders.contexts import Span, Relation from collections import defaultdict, namedtuple +from trove.dataloaders.contexts import Span, Relation def get_text(words, offsets): @@ -111,7 +111,7 @@ def dict_matcher(sentence, # ignore whitespace when matching dictionary terms text = span.text if ignore_whitespace: - text = re.sub(r'''\s{2,}|\n{1,}''', ' ', span.text).strip() + text = re.sub(r'''\s{2,}|\n+''', ' ', span.text).strip() # search for matches in all dictionaries for name in dictionaries: diff --git a/trove/metrics/analysis.py b/trove/metrics/analysis.py index 5fd0b8a..4aaf1b7 100644 --- a/trove/metrics/analysis.py +++ b/trove/metrics/analysis.py @@ -1,3 +1,4 @@ +import torch import numpy as np import scipy.sparse as sparse from scipy.sparse import issparse