Skip to content

Commit

Permalink
multi-word per token
Browse files Browse the repository at this point in the history
  • Loading branch information
hextrato committed Mar 2, 2021
1 parent 02ba1d4 commit fdcfc4e
Show file tree
Hide file tree
Showing 46 changed files with 907 additions and 48 deletions.
2 changes: 1 addition & 1 deletion example000-corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import texas as tx

# create a document
mydoc1 = tx.Document("Hello world!!! How are you today?", "en")
mydoc1 = tx.Document("Hello world!!! How are you today?", "en") # pType = "statement")
mydoc1.meta().set("authors","hegler,yiwen,celine,yuqian")
mydoc1.date().setTimestamp("2021-01-19T14:44") # ??
mydoc1.setTokenList( ["Hello", "world", "!","!","!","How","are","you","today","?"] )
Expand Down
69 changes: 69 additions & 0 deletions example002-stanza.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,66 @@
import texas as tx
import stanza

# ignore all the following "playing" segments

'''
nlp = stanza.Pipeline("pt")
text = "Eu não gosto daquela laranja que vejo no pomar."
doc = nlp(text)
for sentence in doc.sentences:
print(sentence)
for token in doc.iter_tokens():
print(token)
UDPipe
nlpD = spacy_udpipe.load("pt")
# nlpD = spacy_udpipe.load("pt").tokenizer
text = "
u não gosto daquela laranja que vejo no pomar.
"
doc = nlpD(text)
for t in doc:
tok = t
print(t, t.idx)
for sentence in doc.sents:
sent = sentence
print(sentence)
spacy
nlp = spacy.load("pt_core_news_sm")
text = "Eu não gosto daquela laranja que vejo no pomar."
doc = nlp(text)
>>> Italian !!!
nlp = spacy.load("it_core_news_sm")
text = "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari."
doc = nlp(text)
for t in doc:
tok = t
print(t, t.idx, t.lemma_, t.pos_, t.tag_)
nlp = stanza.Pipeline("it")
text = "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari."
doc = nlp(text)
for sentence in doc.sentences:
print(sentence)
for token in doc.iter_tokens():
print(token)
'''

# create a document

TXLang = "en"
Expand Down Expand Up @@ -35,6 +95,15 @@
# mydoc1.meta().set("generator","stanza")
# mydoc1.meta().set("model",TXSpacyModel)

'''
text = "Eu não gosto daquela laranja que de vejo no pomar."
Eu não gosto "de+aquela" laranja que vejo em+o pomar .
"PREP+PRON"
["PREP","PRON"]
'''

mydoc1.setTokenList( nlpTokenList, indexed=True)
mydoc1.views().get("TOKENS").meta().set("generator","stanza")
mydoc1.views().get("TOKENS").meta().set("model",TXSpacyModel)
Expand Down
131 changes: 131 additions & 0 deletions example004-stanza-tabular-view.portuguese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import json
import texas as tx
import stanza

# Build a TEXAS document from stanza output for Portuguese -- a language with
# multi-word tokens (e.g. "do" -> "de" + "o") -- then render a tabular view.

TXLang = "pt"
TXText = "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares."
# TXText = "Boa noite!"

TXSpacyModel = TXLang

# tokenize,pos,lemma are the only processors the views below need.
nlp = stanza.Pipeline(TXSpacyModel,processors='tokenize,pos,lemma')

print ("Processing text: ",TXText)

doc = nlp(TXText)

# Parallel per-token lists fed into the TEXAS document below.
sentIndex = 0
nlpTokenList = []              # surface text of each token
nlpWordsList = []              # None, or the word list for a multi-word token
nlpPOSList = []                # POS tag (or list of tags for compound tokens)
nlpLemmaList = []              # lemma (or list of lemmas for compound tokens)
nlpSentenceEndPositions = []   # cumulative token index where each sentence ends
hasCompoundWords = False       # set when any token expands to several words

for sentence in doc.sentences:
    # Sentence boundaries are expressed as cumulative token counts.
    sentIndex += len(sentence.tokens)
    nlpSentenceEndPositions.append(sentIndex)
    for token in sentence.tokens:
        nlpTokenList.append(token.text)
        if len(token.words) == 1:
            # 1 word per token: store tag and lemma directly.
            nlpWordsList.append(None)
            nlpPOSList.append(token.words[0].pos)
            nlpLemmaList.append(token.words[0].lemma)
        else:
            # N words per token: keep per-token parallel lists instead.
            hasCompoundWords = True
            tokenWords = []
            tokenLemmas = []
            tokenPOStags = []
            for word in token.words:
                tokenWords.append(word.text)
                tokenLemmas.append(word.lemma)
                tokenPOStags.append(word.pos)
            nlpWordsList.append(tokenWords)
            nlpPOSList.append(tokenPOStags)
            nlpLemmaList.append(tokenLemmas)

print( "nlpTokenList" , len(nlpTokenList) , nlpTokenList)
print( "nlpWordsList" , len(nlpWordsList) , nlpWordsList )
print( "nlpPOSList" , len(nlpPOSList) , nlpPOSList )
print( "nlpLemmaList" , len(nlpLemmaList) , nlpLemmaList )

mydoc1 = tx.Document(TXText, TXLang)
# mydoc1.meta().set("generator","stanza")
# mydoc1.meta().set("model",TXSpacyModel)

mydoc1.setTokenList( nlpTokenList, indexed=True)
mydoc1.views().get("TOKENS").meta().set("generator","stanza")
mydoc1.views().get("TOKENS").meta().set("model",TXSpacyModel)
mydoc1.setSentenceList( nlpSentenceEndPositions )

# The WORDS view only exists when some token was split into several words.
if hasCompoundWords:
    mydoc1.addTokenView( "WORDS", nlpWordsList )
mydoc1.addTokenView( "LEMMA", nlpLemmaList )
mydoc1.addTokenView( "POS", nlpPOSList )

# Round-trip check: rebuild a second document from the serialized (TAS) JSON.
mydoc2 = tx.reverse(mydoc1.TAS())

print("==========")
print("mydoc2")
print("----------")
print( "--- Document TAS" )
print( json.dumps(mydoc2.TAS()) )
print( "--- Token List" )
print( mydoc2.getTokenList() )
print( "--- Token Info" )
print( json.dumps( mydoc2.getTokenInfo() ) )
print( "--- Sentence Info" )
print( json.dumps( mydoc2.getSentenceInfo() ) )

print("")
print("============")
print("Tabular View")
print("------------")
myTabView = tx.UITabularView(mydoc2)
if hasCompoundWords:
    myTabView.showView("WORDS")
myTabView.showView("LEMMA", labelCSS=False)
myTabView.showView("POS")
print(myTabView.HTML())
print("------------")

print("")
print("end!")
print("")
39 changes: 39 additions & 0 deletions example005-relations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import json
import texas as tx

# Build a small English document by hand (no NLP pipeline involved).
srcDoc = tx.Document("Hello world!!! How are you today?", "en")
srcDoc.meta().set("authors","hegler,yiwen,celine,yuqian")
srcDoc.date().setTimestamp("2021-01-19T14:44") # ??

tokens = ["Hello", "world", "!","!","!","How","are","you","today","?"]
posTags = ["?", "NOUN", "PUNCT","PUNCT","PUNCT","?","VERB","PRON","?","PUNCT"]
srcDoc.setTokenList( tokens )
srcDoc.addTokenView( "POS", posTags )
srcDoc.setSentenceList( [5,10] )

# Span annotations: a custom NER-like view filled in two batches.
srcDoc.addSpanView( "NER0", pType = "custom" )
srcDoc.addSpanAnns( "NER0", [ {"label":"SOMETHING", "start_token":1, "final_token":2} , {"label":"QUESTION", "start_token":9, "final_token":10} ] )
srcDoc.addSpanAnns( "NER0", {"label":"WORLD", "token_index":7} )

# Relation annotations: one relation rooted at the predicate "be".
srcDoc.addRelationView( "RELATION0" , pType = "ignore")
srcDoc.addRelationRoot( pViewName = "RELATION0" , pRelationName = "R0", pRootType = "predicate", pRootSpan = {"label":"be", "token_index":6} )

# Round-trip: rebuild a second document from the serialized (TAS) JSON.
roundTrip = tx.reverse(srcDoc.TAS())

print("==========")
print("mydoc2")
print("----------")
print( "--- Token List" )
print( roundTrip.getTokenList() )
print( "--- Token Info" )
print( json.dumps( roundTrip.getTokenInfo() ) )
print( "--- Sentence Info" )
print( json.dumps( roundTrip.getSentenceInfo() ) )
print( "--- Document TAS" )
print( json.dumps(roundTrip.TAS()) )


print("")
print("end!")
print("")
2 changes: 1 addition & 1 deletion texas/TXCorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ class Corpus(TextAnnotationSchema):
def __init__(self, pLang : str = None):
    """ Set TEXAS type as 'corpus' and text = '' """
    # A corpus is a TextAnnotationSchema with no text of its own and a
    # fixed TEXAS type of "corpus"; the language is optional.
    super(Corpus, self).__init__(pText = None, pLang = pLang, pType = "corpus")

    # Record the concrete TEXAS class name (used for (de)serialisation
    # of the schema -- presumably by tx.reverse; confirm against core).
    self.setTexasClass("tx.Corpus")
5 changes: 3 additions & 2 deletions texas/TXDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

class Document(TextAnnotationSchema):
    """A single TEXAS document wrapping one text.

    The TEXAS type defaults to "document" but may be overridden via
    ``pType`` (e.g. ``pType="statement"``) by callers that need a more
    specific schema type -- this is the generalisation this commit adds.
    """

    def __init__(self, pText : str, pLang : str = None, pType : str = "document"):
        """ Set TEXAS type as 'document' (or the caller-supplied pType) """
        super(Document, self).__init__(pText = pText, pLang = pLang, pType = pType)
        # Record the concrete TEXAS class name for (de)serialisation.
        self.setTexasClass("tx.Document")

1 change: 0 additions & 1 deletion texas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@
# from .annotations import *
from .TXCorpus import Corpus
from .TXDocument import Document
from .TXQuestion import Question
from .core.TextAnnotationSchema import reverse
from .ui.TabularView import UITabularView
Binary file modified texas/__pycache__/TXCorpus.cpython-38.pyc
Binary file not shown.
Binary file modified texas/__pycache__/TXDocument.cpython-38.pyc
Binary file not shown.
Binary file modified texas/__pycache__/__init__.cpython-38.pyc
Binary file not shown.
7 changes: 7 additions & 0 deletions texas/anns/Annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

class Annotation:
    """Base class for all TEXAS annotation kinds (char, token, span, relation)."""

    def __init__(self):
        """Create an empty annotation; subclasses add their own fields."""

80 changes: 80 additions & 0 deletions texas/anns/AnnotationSet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-

from .Annotation import Annotation
from .CharAnnotation import CharAnnotation
from .TokenAnnotation import TokenAnnotation
from .SpanAnnotation import SpanAnnotation
from .RelationAnnotation import RelationAnnotation

class AnnotationSet:
    """An insertion-ordered collection of Annotation objects.

    Supports serialisation to the TEXAS JSON form (``TAS``) and
    reconstruction from that form (``reverse``).
    """

    def __init__(self):
        # Insertion-ordered list of Annotation instances.
        self._anns = []

    def add(self, pAnn: Annotation):
        """Append one annotation.

        Raises:
            Exception: if ``pAnn`` is not an Annotation instance.
        """
        if not isinstance(pAnn, Annotation):
            raise Exception("AnnotationSet 'pAnn' parameter class is required to be 'Annotation'")
        self._anns.append(pAnn)

    def getAnns(self):
        """Return the underlying (mutable) annotation list."""
        return self._anns

    def anns(self):
        """Alias of getAnns(), kept for backward compatibility."""
        return self._anns

    def size(self):
        """Return the number of annotations currently held."""
        return len(self._anns)

    def TAS(self):
        """Serialise every annotation to its TEXAS dict representation."""
        return [annotation.TAS() for annotation in self.getAnns()]

    def reverse(self, jss: list):
        """Rebuild annotations from a list of TEXAS dicts (appends to this set).

        Each entry must carry a ``type`` of "char", "token" or "span";
        entries with any other type are silently skipped.
        NOTE(review): "relation" entries are not reconstructed even though
        RelationAnnotation is imported at the top of this module -- confirm
        whether that is intentional or still to be implemented.

        Raises:
            Exception: on a non-list argument, a non-dict entry, or a
                missing required attribute for the entry's type.
        """
        if jss is None:
            return
        if not isinstance(jss, list):
            raise Exception("AnnotationSet reverse 'jss' parameter is required to be 'list'")
        for ann in jss:
            if not isinstance(ann, dict):
                raise Exception("Annotation in reverse 'anns' list is required to be 'dict'")
            if "type" not in ann:
                raise Exception("Missing 'type' attribute in Annotation during reverse")
            if ann["type"] == "char":
                if "index" not in ann:
                    ann["index"] = None
                if "label" not in ann:
                    raise Exception("Missing 'label' attribute in CharAnnotation during reverse")
                if "start_char" not in ann:
                    raise Exception("Missing 'start_char' attribute in CharAnnotation during reverse")
                if "final_char" not in ann:
                    raise Exception("Missing 'final_char' attribute in CharAnnotation during reverse")
                self._anns.append(CharAnnotation(pStartChar = ann["start_char"], pFinalChar = ann["final_char"], pLabel = ann["label"], pIndex = ann["index"]))
            elif ann["type"] == "token":
                # Messages below previously said 'CharAnnotation' (copy-paste bug).
                if "token_index" not in ann:
                    raise Exception("Missing 'token_index' attribute in TokenAnnotation during reverse")
                if "label" not in ann:
                    raise Exception("Missing 'label' attribute in TokenAnnotation during reverse")
                self._anns.append(TokenAnnotation(pTokenIndex = ann["token_index"], pLabel = ann["label"]))
            elif ann["type"] == "span":
                if "start_token" not in ann:
                    raise Exception("Missing 'start_token' attribute in SpanAnnotation during reverse")
                if "final_token" not in ann:
                    raise Exception("Missing 'final_token' attribute in SpanAnnotation during reverse")
                if "label" not in ann:
                    raise Exception("Missing 'label' attribute in SpanAnnotation during reverse")
                # Optional "span" payload defaults to None when absent.
                self._anns.append(SpanAnnotation(pStartToken = ann["start_token"], pFinalToken = ann["final_token"], pLabel = ann["label"], pSpan = ann.get("span")))
Loading

0 comments on commit fdcfc4e

Please sign in to comment.