-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
hextrato
committed
Mar 2, 2021
1 parent
02ba1d4
commit fdcfc4e
Showing
46 changed files
with
907 additions
and
48 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
import json
import texas as tx
import stanza

# Demo: run a stanza pipeline (tokenize/pos/lemma) over a Portuguese sentence,
# load the annotations into a texas Document, round-trip it through its TAS
# JSON form, and render a tabular HTML view.

TXLang = "pt"
TXText = "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares."
# TXText = "Boa noite!"

# NOTE(review): name kept for backward compatibility with the spaCy variant of
# this demo — here it is a stanza language/model id, not a spaCy model name.
TXSpacyModel = TXLang

nlp = stanza.Pipeline(TXSpacyModel,processors='tokenize,pos,lemma')

print ("Processing text: ",TXText)

doc = nlp(TXText)

sentIndex = 0
nlpTokenList = []            # surface text of each token
nlpWordsList = []            # per-token word expansion (None for 1-word tokens)
nlpPOSList = []              # POS tag, or list of tags for multi-word tokens
nlpLemmaList = []            # lemma, or list of lemmas for multi-word tokens
nlpSentenceEndPositions = [] # cumulative end-token index of each sentence
hasCompoundWords = False     # set when any token expands to several words (MWT)

for sentence in doc.sentences:
    # Sentence boundaries are recorded as cumulative token counts.
    sentIndex += len(sentence.tokens)
    nlpSentenceEndPositions.append(sentIndex)
    for token in sentence.tokens:
        nlpTokenList.append(token.text)
        if len(token.words) == 1:
            # Simple case: exactly one word per token.
            word = token.words[0]
            nlpWordsList.append(None)
            nlpPOSList.append(word.pos)
            nlpLemmaList.append(word.lemma)
        else:
            # Multi-word token: keep the expansion as parallel per-word lists.
            hasCompoundWords = True
            nlpWordsList.append([word.text for word in token.words])
            nlpPOSList.append([word.pos for word in token.words])
            nlpLemmaList.append([word.lemma for word in token.words])

print( "nlpTokenList" , len(nlpTokenList) , nlpTokenList)
print( "nlpWordsList" , len(nlpWordsList) , nlpWordsList )
print( "nlpPOSList" , len(nlpPOSList) , nlpPOSList )
print( "nlpLemmaList" , len(nlpLemmaList) , nlpLemmaList )

# Build the texas document from the extracted annotations.
mydoc1 = tx.Document(TXText, TXLang)
# mydoc1.meta().set("generator","stanza")
# mydoc1.meta().set("model",TXSpacyModel)

mydoc1.setTokenList( nlpTokenList, indexed=True)
mydoc1.views().get("TOKENS").meta().set("generator","stanza")
mydoc1.views().get("TOKENS").meta().set("model",TXSpacyModel)
mydoc1.setSentenceList( nlpSentenceEndPositions )

# The WORDS view only exists when at least one multi-word token was seen.
if hasCompoundWords:
    mydoc1.addTokenView( "WORDS", nlpWordsList )
mydoc1.addTokenView( "LEMMA", nlpLemmaList )
mydoc1.addTokenView( "POS", nlpPOSList )

# Round-trip: rebuild a second document from the first one's TAS JSON.
mydoc2 = tx.reverse(mydoc1.TAS())

print("==========")
print("mydoc2")
print("----------")
print( "--- Document TAS" )
print( json.dumps(mydoc2.TAS()) )
print( "--- Token List" )
print( mydoc2.getTokenList() )
print( "--- Token Info" )
print( json.dumps( mydoc2.getTokenInfo() ) )
print( "--- Sentence Info" )
print( json.dumps( mydoc2.getSentenceInfo() ) )

print("")
print("============")
print("Tabular View")
print("------------")
myTabView = tx.UITabularView(mydoc2)
if hasCompoundWords:
    myTabView.showView("WORDS")
myTabView.showView("LEMMA", labelCSS=False)
myTabView.showView("POS")
print(myTabView.HTML())
print("------------")

print("")
print("end!")
print("")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import json
import texas as tx

# Demo: build a small English document with token, span and relation views,
# then rebuild it from its TAS JSON representation and dump the result.

tokens = ["Hello", "world", "!", "!", "!", "How", "are", "you", "today", "?"]
pos_tags = ["?", "NOUN", "PUNCT", "PUNCT", "PUNCT", "?", "VERB", "PRON", "?", "PUNCT"]

mydoc1 = tx.Document("Hello world!!! How are you today?", "en")
mydoc1.meta().set("authors", "hegler,yiwen,celine,yuqian")
mydoc1.date().setTimestamp("2021-01-19T14:44")  # timestamp format to be confirmed
mydoc1.setTokenList(tokens)
mydoc1.addTokenView("POS", pos_tags)
mydoc1.setSentenceList([5, 10])

# Span annotations: a custom NER-style view populated with three spans.
mydoc1.addSpanView("NER0", pType="custom")
mydoc1.addSpanAnns("NER0", [
    {"label": "SOMETHING", "start_token": 1, "final_token": 2},
    {"label": "QUESTION", "start_token": 9, "final_token": 10},
])
mydoc1.addSpanAnns("NER0", {"label": "WORLD", "token_index": 7})

# Relation view with a single predicate root.
mydoc1.addRelationView("RELATION0", pType="ignore")
mydoc1.addRelationRoot(
    pViewName="RELATION0",
    pRelationName="R0",
    pRootType="predicate",
    pRootSpan={"label": "be", "token_index": 6},
)

# Round-trip: rebuild a second document from the first one's TAS JSON.
mydoc2 = tx.reverse(mydoc1.TAS())

print("==========")
print("mydoc2")
print("----------")
print("--- Token List")
print(mydoc2.getTokenList())
print("--- Token Info")
print(json.dumps(mydoc2.getTokenInfo()))
print("--- Sentence Info")
print(json.dumps(mydoc2.getSentenceInfo()))
print("--- Document TAS")
print(json.dumps(mydoc2.TAS()))

print("")
print("end!")
print("")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/usr/bin/python | ||
#-*- coding: utf-8 -*- | ||
|
||
class Annotation:
    """Base class for annotation types.

    Carries no state of its own; concrete annotation classes (presumably
    CharAnnotation, TokenAnnotation, etc. — see AnnotationSet) extend it,
    and it serves as the common type for isinstance checks.
    """

    def __init__(self):
        # Intentionally empty: subclasses define their own attributes.
        pass
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
#!/usr/bin/python | ||
#-*- coding: utf-8 -*- | ||
|
||
from .Annotation import Annotation | ||
from .CharAnnotation import CharAnnotation | ||
from .TokenAnnotation import TokenAnnotation | ||
from .SpanAnnotation import SpanAnnotation | ||
from .RelationAnnotation import RelationAnnotation | ||
|
||
class AnnotationSet:
    """Ordered collection of Annotation objects.

    Supports serialization to a list of TAS dicts via TAS() and
    reconstruction from such a list via reverse().
    """

    def __init__(self):
        # Annotations in insertion order.
        self._anns = []

    def add(self, pAnn: Annotation):
        """Append *pAnn* to the set.

        Raises Exception if *pAnn* is not an Annotation instance.
        """
        if not isinstance(pAnn, Annotation):
            raise Exception("AnnotationSet 'pAnn' parameter class is required to be 'Annotation'")
        self._anns.append(pAnn)

    def getAnns(self):
        """Return the underlying annotation list (not a copy)."""
        return self._anns

    def anns(self):
        """Alias of getAnns(), kept for backward compatibility."""
        return self._anns

    def size(self):
        """Return the number of annotations held."""
        return len(self._anns)

    def TAS(self):
        """Serialize every annotation to its TAS dict representation."""
        return [annotation.TAS() for annotation in self.getAnns()]

    def reverse(self, jss: list):
        """Rebuild annotations from *jss*, a list of TAS dicts.

        Each dict must carry a 'type' key ('char', 'token' or 'span') plus the
        attributes that annotation type requires; a missing required attribute
        raises Exception. Entries with an unrecognized 'type' are silently
        skipped (original behavior, preserved). Returns None when *jss* is None.
        """
        if jss is None:
            return
        if not isinstance(jss, list):
            raise Exception("AnnotationSet reverse 'jss' parameter is required to be 'list'")
        for ann in jss:
            if not isinstance(ann, dict):
                raise Exception("Annotation in reverse 'anns' list is required to be 'dict'")
            if "type" not in ann:
                raise Exception("Missing 'type' attribute in Annotation during reverse")
            if ann["type"] == "char":
                # 'index' is optional; default it in place as the original did.
                if "index" not in ann:
                    ann["index"] = None
                if "label" not in ann:
                    raise Exception("Missing 'label' attribute in CharAnnotation during reverse")
                if "start_char" not in ann:
                    raise Exception("Missing 'start_char' attribute in CharAnnotation during reverse")
                if "final_char" not in ann:
                    raise Exception("Missing 'final_char' attribute in CharAnnotation during reverse")
                self.getAnns().append(CharAnnotation(pStartChar = ann["start_char"], pFinalChar = ann["final_char"], pLabel = ann["label"], pIndex = ann["index"]))
            elif ann["type"] == "token":
                # BUGFIX: messages previously said 'CharAnnotation' (copy-paste).
                if "token_index" not in ann:
                    raise Exception("Missing 'token_index' attribute in TokenAnnotation during reverse")
                if "label" not in ann:
                    raise Exception("Missing 'label' attribute in TokenAnnotation during reverse")
                self.getAnns().append(TokenAnnotation(pTokenIndex = ann["token_index"], pLabel = ann["label"]))
            elif ann["type"] == "span":
                # BUGFIX: messages previously said 'CharAnnotation' (copy-paste).
                if "start_token" not in ann:
                    raise Exception("Missing 'start_token' attribute in SpanAnnotation during reverse")
                if "final_token" not in ann:
                    raise Exception("Missing 'final_token' attribute in SpanAnnotation during reverse")
                if "label" not in ann:
                    raise Exception("Missing 'label' attribute in SpanAnnotation during reverse")
                # 'span' payload is optional.
                annSpan = ann.get("span")
                self.getAnns().append(SpanAnnotation(pStartToken = ann["start_token"], pFinalToken = ann["final_token"], pLabel = ann["label"], pSpan=annSpan))
Oops, something went wrong.