Skip to content

Commit

Permalink
multi-word per token
Browse files Browse the repository at this point in the history
  • Loading branch information
hextrato committed Mar 2, 2021
1 parent 02ba1d4 commit fdcfc4e
Show file tree
Hide file tree
Showing 46 changed files with 907 additions and 48 deletions.
2 changes: 1 addition & 1 deletion example000-corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import texas as tx

# create a document
mydoc1 = tx.Document("Hello world!!! How are you today?", "en")
mydoc1 = tx.Document("Hello world!!! How are you today?", "en") # pType = "statement")
mydoc1.meta().set("authors","hegler,yiwen,celine,yuqian")
mydoc1.date().setTimestamp("2021-01-19T14:44") # ??
mydoc1.setTokenList( ["Hello", "world", "!","!","!","How","are","you","today","?"] )
Expand Down
69 changes: 69 additions & 0 deletions example002-stanza.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,66 @@
import texas as tx
import stanza

# ignore all the following "playing" segments

'''
nlp = stanza.Pipeline("pt")
text = "Eu não gosto daquela laranja que vejo no pomar."
doc = nlp(text)
for sentence in doc.sentences:
print(sentence)
for token in doc.iter_tokens():
print(token)
UDPipe
nlpD = spacy_udpipe.load("pt")
# nlpD = spacy_udpipe.load("pt").tokenizer
text = "
u não gosto daquela laranja que vejo no pomar.
"
doc = nlpD(text)
for t in doc:
tok = t
print(t, t.idx)
for sentence in doc.sents:
sent = sentence
print(sentence)
spacy
nlp = spacy.load("pt_core_news_sm")
text = "Eu não gosto daquela laranja que vejo no pomar."
doc = nlp(text)
>>> Italian !!!
nlp = spacy.load("it_core_news_sm")
text = "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari."
doc = nlp(text)
for t in doc:
tok = t
print(t, t.idx, t.lemma_, t.pos_, t.tag_)
nlp = stanza.Pipeline("it")
text = "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari."
doc = nlp(text)
for sentence in doc.sentences:
print(sentence)
for token in doc.iter_tokens():
print(token)
'''

# create a document

TXLang = "en"
Expand Down Expand Up @@ -35,6 +95,15 @@
# mydoc1.meta().set("generator","stanza")
# mydoc1.meta().set("model",TXSpacyModel)

'''
text = "Eu não gosto daquela laranja que de vejo no pomar."
Eu não gosto "de+aquela" laranja que vejo em+o pomar .
"PREP+PRON"
["PREP","PRON"]
'''

mydoc1.setTokenList( nlpTokenList, indexed=True)
mydoc1.views().get("TOKENS").meta().set("generator","stanza")
mydoc1.views().get("TOKENS").meta().set("model",TXSpacyModel)
Expand Down
131 changes: 131 additions & 0 deletions example004-stanza-tabular-view.portuguese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import json
import texas as tx
import stanza

# Build a TEXAS document from stanza output for Portuguese -- a language with
# multi-word tokens (e.g. "do" -> "de" + "o") -- then render a tabular view.

TXLang = "pt"
TXText = "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares."
# TXText = "Boa noite!"

TXSpacyModel = TXLang

# tokenize,pos,lemma are the only processors the views below need.
nlp = stanza.Pipeline(TXSpacyModel,processors='tokenize,pos,lemma')

print ("Processing text: ",TXText)

doc = nlp(TXText)

# Parallel per-token lists fed into the TEXAS document below.
sentIndex = 0
nlpTokenList = []              # surface text of each token
nlpWordsList = []              # None, or the word list for a multi-word token
nlpPOSList = []                # POS tag (or list of tags for compound tokens)
nlpLemmaList = []              # lemma (or list of lemmas for compound tokens)
nlpSentenceEndPositions = []   # cumulative token index where each sentence ends
hasCompoundWords = False       # set when any token expands to several words

for sentence in doc.sentences:
    # Sentence boundaries are expressed as cumulative token counts.
    sentIndex += len(sentence.tokens)
    nlpSentenceEndPositions.append(sentIndex)
    for token in sentence.tokens:
        nlpTokenList.append(token.text)
        if len(token.words) == 1:
            # 1 word per token: store tag and lemma directly.
            nlpWordsList.append(None)
            nlpPOSList.append(token.words[0].pos)
            nlpLemmaList.append(token.words[0].lemma)
        else:
            # N words per token: keep per-token parallel lists instead.
            hasCompoundWords = True
            tokenWords = []
            tokenLemmas = []
            tokenPOStags = []
            for word in token.words:
                tokenWords.append(word.text)
                tokenLemmas.append(word.lemma)
                tokenPOStags.append(word.pos)
            nlpWordsList.append(tokenWords)
            nlpPOSList.append(tokenPOStags)
            nlpLemmaList.append(tokenLemmas)

print( "nlpTokenList" , len(nlpTokenList) , nlpTokenList)
print( "nlpWordsList" , len(nlpWordsList) , nlpWordsList )
print( "nlpPOSList" , len(nlpPOSList) , nlpPOSList )
print( "nlpLemmaList" , len(nlpLemmaList) , nlpLemmaList )

mydoc1 = tx.Document(TXText, TXLang)
# mydoc1.meta().set("generator","stanza")
# mydoc1.meta().set("model",TXSpacyModel)

mydoc1.setTokenList( nlpTokenList, indexed=True)
mydoc1.views().get("TOKENS").meta().set("generator","stanza")
mydoc1.views().get("TOKENS").meta().set("model",TXSpacyModel)
mydoc1.setSentenceList( nlpSentenceEndPositions )

# The WORDS view only exists when some token was split into several words.
if hasCompoundWords:
    mydoc1.addTokenView( "WORDS", nlpWordsList )
mydoc1.addTokenView( "LEMMA", nlpLemmaList )
mydoc1.addTokenView( "POS", nlpPOSList )

# Round-trip check: rebuild a second document from the serialized (TAS) JSON.
mydoc2 = tx.reverse(mydoc1.TAS())

print("==========")
print("mydoc2")
print("----------")
print( "--- Document TAS" )
print( json.dumps(mydoc2.TAS()) )
print( "--- Token List" )
print( mydoc2.getTokenList() )
print( "--- Token Info" )
print( json.dumps( mydoc2.getTokenInfo() ) )
print( "--- Sentence Info" )
print( json.dumps( mydoc2.getSentenceInfo() ) )

print("")
print("============")
print("Tabular View")
print("------------")
myTabView = tx.UITabularView(mydoc2)
if hasCompoundWords:
    myTabView.showView("WORDS")
myTabView.showView("LEMMA", labelCSS=False)
myTabView.showView("POS")
print(myTabView.HTML())
print("------------")

print("")
print("end!")
print("")
39 changes: 39 additions & 0 deletions example005-relations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import json
import texas as tx

# Build a small English document by hand (no NLP pipeline involved).
srcDoc = tx.Document("Hello world!!! How are you today?", "en")
srcDoc.meta().set("authors","hegler,yiwen,celine,yuqian")
srcDoc.date().setTimestamp("2021-01-19T14:44") # ??

tokens = ["Hello", "world", "!","!","!","How","are","you","today","?"]
posTags = ["?", "NOUN", "PUNCT","PUNCT","PUNCT","?","VERB","PRON","?","PUNCT"]
srcDoc.setTokenList( tokens )
srcDoc.addTokenView( "POS", posTags )
srcDoc.setSentenceList( [5,10] )

# Span annotations: a custom NER-like view filled in two batches.
srcDoc.addSpanView( "NER0", pType = "custom" )
srcDoc.addSpanAnns( "NER0", [ {"label":"SOMETHING", "start_token":1, "final_token":2} , {"label":"QUESTION", "start_token":9, "final_token":10} ] )
srcDoc.addSpanAnns( "NER0", {"label":"WORLD", "token_index":7} )

# Relation annotations: one relation rooted at the predicate "be".
srcDoc.addRelationView( "RELATION0" , pType = "ignore")
srcDoc.addRelationRoot( pViewName = "RELATION0" , pRelationName = "R0", pRootType = "predicate", pRootSpan = {"label":"be", "token_index":6} )

# Round-trip: rebuild a second document from the serialized (TAS) JSON.
roundTrip = tx.reverse(srcDoc.TAS())

print("==========")
print("mydoc2")
print("----------")
print( "--- Token List" )
print( roundTrip.getTokenList() )
print( "--- Token Info" )
print( json.dumps( roundTrip.getTokenInfo() ) )
print( "--- Sentence Info" )
print( json.dumps( roundTrip.getSentenceInfo() ) )
print( "--- Document TAS" )
print( json.dumps(roundTrip.TAS()) )


print("")
print("end!")
print("")
2 changes: 1 addition & 1 deletion texas/TXCorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ class Corpus(TextAnnotationSchema):
def __init__(self, pLang : str = None):
    """ Set TEXAS type as 'corpus' and text = '' """
    # A corpus is a TextAnnotationSchema with no text of its own and a
    # fixed TEXAS type of "corpus"; the language is optional.
    super(Corpus, self).__init__(pText = None, pLang = pLang, pType = "corpus")

    # Record the concrete TEXAS class name (used for (de)serialisation
    # of the schema -- presumably by tx.reverse; confirm against core).
    self.setTexasClass("tx.Corpus")
5 changes: 3 additions & 2 deletions texas/TXDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

class Document(TextAnnotationSchema):
    """A single TEXAS document wrapping one text.

    The TEXAS type defaults to "document" but may be overridden via
    ``pType`` (e.g. ``pType="statement"``) by callers that need a more
    specific schema type -- this is the generalisation this commit adds.
    """

    def __init__(self, pText : str, pLang : str = None, pType : str = "document"):
        """ Set TEXAS type as 'document' (or the caller-supplied pType) """
        super(Document, self).__init__(pText = pText, pLang = pLang, pType = pType)
        # Record the concrete TEXAS class name for (de)serialisation.
        self.setTexasClass("tx.Document")

1 change: 0 additions & 1 deletion texas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@
# from .annotations import *
from .TXCorpus import Corpus
from .TXDocument import Document
from .TXQuestion import Question
from .core.TextAnnotationSchema import reverse
from .ui.TabularView import UITabularView
Binary file modified texas/__pycache__/TXCorpus.cpython-38.pyc
Binary file not shown.
Binary file modified texas/__pycache__/TXDocument.cpython-38.pyc
Binary file not shown.
Binary file modified texas/__pycache__/__init__.cpython-38.pyc
Binary file not shown.
7 changes: 7 additions & 0 deletions texas/anns/Annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

class Annotation:
    """Base class for all TEXAS annotation kinds (char, token, span, relation)."""

    def __init__(self):
        """Create an empty annotation; subclasses add their own fields."""

80 changes: 80 additions & 0 deletions texas/anns/AnnotationSet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from .Annotation import Annotation
from .CharAnnotation import CharAnnotation
from .TokenAnnotation import TokenAnnotation
from .SpanAnnotation import SpanAnnotation
from .RelationAnnotation import RelationAnnotation

class AnnotationSet:
    """An insertion-ordered collection of Annotation objects.

    Supports serialisation to the TEXAS JSON form (``TAS``) and
    reconstruction from that form (``reverse``).
    """

    def __init__(self):
        # Insertion-ordered list of Annotation instances.
        self._anns = []

    def add(self, pAnn: Annotation):
        """Append one annotation.

        Raises:
            Exception: if ``pAnn`` is not an Annotation instance.
        """
        if not isinstance(pAnn, Annotation):
            raise Exception("AnnotationSet 'pAnn' parameter class is required to be 'Annotation'")
        self._anns.append(pAnn)

    def getAnns(self):
        """Return the underlying (mutable) annotation list."""
        return self._anns

    def anns(self):
        """Alias of getAnns(), kept for backward compatibility."""
        return self._anns

    def size(self):
        """Return the number of annotations currently held."""
        return len(self._anns)

    def TAS(self):
        """Serialise every annotation to its TEXAS dict representation."""
        return [annotation.TAS() for annotation in self.getAnns()]

    def reverse(self, jss: list):
        """Rebuild annotations from a list of TEXAS dicts (appends to this set).

        Each entry must carry a ``type`` of "char", "token" or "span";
        entries with any other type are silently skipped.
        NOTE(review): "relation" entries are not reconstructed even though
        RelationAnnotation is imported at the top of this module -- confirm
        whether that is intentional or still to be implemented.

        Raises:
            Exception: on a non-list argument, a non-dict entry, or a
                missing required attribute for the entry's type.
        """
        if jss is None:
            return
        if not isinstance(jss, list):
            raise Exception("AnnotationSet reverse 'jss' parameter is required to be 'list'")
        for ann in jss:
            if not isinstance(ann, dict):
                raise Exception("Annotation in reverse 'anns' list is required to be 'dict'")
            if "type" not in ann:
                raise Exception("Missing 'type' attribute in Annotation during reverse")
            if ann["type"] == "char":
                if "index" not in ann:
                    ann["index"] = None
                if "label" not in ann:
                    raise Exception("Missing 'label' attribute in CharAnnotation during reverse")
                if "start_char" not in ann:
                    raise Exception("Missing 'start_char' attribute in CharAnnotation during reverse")
                if "final_char" not in ann:
                    raise Exception("Missing 'final_char' attribute in CharAnnotation during reverse")
                self._anns.append(CharAnnotation(pStartChar = ann["start_char"], pFinalChar = ann["final_char"], pLabel = ann["label"], pIndex = ann["index"]))
            elif ann["type"] == "token":
                # Messages below previously said 'CharAnnotation' (copy-paste bug).
                if "token_index" not in ann:
                    raise Exception("Missing 'token_index' attribute in TokenAnnotation during reverse")
                if "label" not in ann:
                    raise Exception("Missing 'label' attribute in TokenAnnotation during reverse")
                self._anns.append(TokenAnnotation(pTokenIndex = ann["token_index"], pLabel = ann["label"]))
            elif ann["type"] == "span":
                if "start_token" not in ann:
                    raise Exception("Missing 'start_token' attribute in SpanAnnotation during reverse")
                if "final_token" not in ann:
                    raise Exception("Missing 'final_token' attribute in SpanAnnotation during reverse")
                if "label" not in ann:
                    raise Exception("Missing 'label' attribute in SpanAnnotation during reverse")
                # Optional "span" payload defaults to None when absent.
                self._anns.append(SpanAnnotation(pStartToken = ann["start_token"], pFinalToken = ann["final_token"], pLabel = ann["label"], pSpan = ann.get("span")))
Loading

0 comments on commit fdcfc4e

Please sign in to comment.