Skip to content

Commit dd3673f

Browse files
authored
Merge pull request #171 from JohnSnowLabs/add-scala-annotatars
add examples scala
2 parents 40a8cf0 + 24274bb commit dd3673f

File tree

5 files changed

+297
-0
lines changed

5 files changed

+297
-0
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/** Trains an AssertionDL assertion-status model on a Negex-annotated corpus
  * and applies it to a small inline clinical test set, printing the result. */
object AssertionDLApproachExample extends App {

  implicit val session = spark

  // Inline test sentences mixing present and negated clinical mentions.
  val testDS = Seq(
    "Has a past history of gastroenteritis and stomach pain, however patient shows no stomach pain now. " +
      "We don't care about gastroenteritis here, but we do care about heart failure. " +
      "Test for asma, no asma.").toDF("text")

  val reader = new NegexDatasetReader

  // Negex training corpus; "sentence" is renamed so the DocumentAssembler
  // can consume it as its "text" input column.
  val datasetPath = "src/test/resources/rsAnnotations-1-120-random.txt"
  val trainDS = reader.readDataframe(datasetPath).withColumnRenamed("sentence", "text").cache

  val documentAssembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

  val sentenceDetector = new SentenceDetector()
    .setInputCols(Array("document"))
    .setOutputCol("sentence")

  val tokenizer = new Tokenizer()
    .setInputCols(Array("sentence"))
    .setOutputCol("token")

  // Part-of-speech tags feed the regex-based Chunker below.
  val POSTag = PerceptronModel
    .pretrained()
    .setInputCols("sentence", "token")
    .setOutputCol("pos")

  // Noun-phrase chunks ("(<NN>)+") are the spans whose assertion status is learned.
  val chunker = new Chunker()
    .setInputCols(Array("pos", "sentence"))
    .setOutputCol("chunk")
    .setRegexParsers(Array("(<NN>)+"))

  val pubmed = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols("sentence", "token")
    .setOutputCol("embeddings")
    .setCaseSensitive(false)

  // Trainable assertion-status annotator; start/end/label columns come from
  // the Negex reader's output schema.
  val assertionStatus = new AssertionDLApproach()
    .setGraphFolder("src/main/resources/assertion_dl/")
    .setInputCols("sentence", "chunk", "embeddings")
    .setOutputCol("assertion")
    .setStartCol("start")
    .setEndCol("end")
    .setLabelCol("label")
    .setLearningRate(0.01f)
    .setDropout(0.15f)
    .setBatchSize(16)
    .setEpochs(3)
    .setValidationSplit(0.2f)

  val stages = Array(documentAssembler, sentenceDetector, tokenizer, POSTag, chunker, pubmed,
    assertionStatus)

  // train Assertion Status
  val pipeline = new Pipeline()
    .setStages(stages)

  val model = pipeline.fit(trainDS)
  model.write.overwrite().save("./tmp_assertiondl_negex")

  val outDf = model.transform(testDS)
  outDf.show(truncate = false)
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/** Runs a pretrained AssertionDL model over noun chunks and keeps only the
  * chunks whose assertion status is in the white list ("present"). */
object AssertionFiltererExample extends App {

  implicit val session = spark

  // Inline test sentences mixing present and negated clinical mentions.
  val testDS = Seq(
    "Has a past history of gastroenteritis and stomach pain, however patient shows no stomach pain now. " +
      "We don't care about gastroenteritis here, but we do care about heart failure. " +
      "Test for asma, no asma.").toDF("text")

  val reader = new NegexDatasetReader

  // Corpus used only to fit the (mostly pretrained) pipeline stages.
  val datasetPath = "src/test/resources/rsAnnotations-1-120-random.txt"
  val trainDS = reader.readDataframe(datasetPath).withColumnRenamed("sentence", "text").cache

  val documentAssembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

  val sentenceDetector = new SentenceDetector()
    .setInputCols(Array("document"))
    .setOutputCol("sentence")

  val tokenizer = new Tokenizer()
    .setInputCols(Array("sentence"))
    .setOutputCol("token")

  // Part-of-speech tags feed the regex-based Chunker below.
  val POSTag = PerceptronModel
    .pretrained()
    .setInputCols("sentence", "token")
    .setOutputCol("pos")

  // Noun-phrase chunks are the spans whose assertion status gets predicted.
  val chunker = new Chunker()
    .setInputCols(Array("pos", "sentence"))
    .setOutputCol("chunk")
    .setRegexParsers(Array("(<NN>)+"))

  val pubmed = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols("sentence", "token")
    .setOutputCol("embeddings")
    .setCaseSensitive(false)

  // Pretrained clinical assertion-status model (e.g. present / absent).
  val assertionStatus = AssertionDLModel.pretrained("assertion_dl", "en", "clinical/models")
    .setInputCols("sentence", "chunk", "embeddings")
    .setOutputCol("assertion")
    .setIncludeConfidence(true)

  // Keep only the chunks whose assertion label matches the white list.
  val assertionFilterer = new AssertionFilterer()
    .setInputCols("sentence", "chunk", "assertion")
    .setOutputCol("filtered")
    .setCriteria("assertion")
    .setWhiteList("present")

  val stages = Array(documentAssembler, sentenceDetector, tokenizer, POSTag, chunker, pubmed,
    assertionStatus, assertionFilterer)

  val pipeline = new Pipeline()
    .setStages(stages)

  val model = pipeline.fit(trainDS)
  model.write.overwrite().save("./tmp_assertiondl_negex")

  val outDf = model.transform(testDS)

  outDf.selectExpr("filtered").show(truncate = false)
  outDf.selectExpr("assertion").show(truncate = false)
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/** De-identifies clinical text: detects PHI entities with a pretrained
  * clinical NER model, converts them to chunks, and obfuscates them using
  * consistent faker-generated replacements. */
object DeidentificationExample extends App {

  // NOTE(review): the original declared four unused bindings as `var`s
  // (trainDataSet, nerDlModel, nerCrfModel, embeddingsFile) that triggered
  // expensive pretrained-model downloads and a CoNLL file read without ever
  // being used. They have been removed; de-identification here needs no
  // training data, so the pipeline is fit on an empty dataset.
  val emptyDataset = Seq(
    ""
  ).toDS.toDF("text")

  val documentAssembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

  val sentenceDetector = new SentenceDetector()
    .setInputCols(Array("document"))
    .setOutputCol("sentence")
    .setUseAbbreviations(true)

  val tokenizer = new Tokenizer()
    .setInputCols(Array("sentence"))
    .setOutputCol("token")

  val embeddings = WordEmbeddingsModel
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(Array("sentence", "token"))
    .setOutputCol("embeddings")

  // Pretrained NER model tuned to detect PHI (names, dates, addresses, ...).
  val clinical_sensitive_entities = NerDLModel
    .pretrained("ner_deid_synthetic", "en", "clinical/models")
    .setInputCols(Array("sentence", "token", "embeddings"))
    .setOutputCol("ner")

  // Groups IOB tags into entity chunks consumed by DeIdentification.
  val nerConverter = new NerConverter()
    .setInputCols(Array("sentence", "token", "ner"))
    .setOutputCol("ner_chunk")

  // Obfuscate mode replaces each PHI chunk with a fake value; consistent
  // obfuscation maps the same original value to the same replacement.
  val deIdentification = new DeIdentification()
    .setInputCols(Array("ner_chunk", "token", "sentence"))
    .setOutputCol("dei")
    .setConsistentObfuscation(true)
    .setMode("obfuscate")
    .setObfuscateRefSource("faker")

  val pipeline = new Pipeline()
    .setStages(Array(
      documentAssembler,
      sentenceDetector,
      tokenizer,
      embeddings,
      clinical_sensitive_entities,
      nerConverter,
      deIdentification
    )).fit(emptyDataset)

  // Synthetic record containing names, dates, ages and an address to redact.
  val testDataset = Seq(
    "Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : " +
      "Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street"
  ).toDS.toDF("text")

  val deIdentificationDataFrame = pipeline.transform(testDataset)
  val dataframe = deIdentificationDataFrame.select("dei.result")
  dataframe.show(truncate = false)
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/** Detects named entities with a pretrained NER model and groups them into
  * chunks whose IOB tag sequence matches a regex (here: a PER entity followed
  * eventually by a LOC entity). */
object NerChunkerFiltererExample extends App {

  val data = ResourceHelper.spark
    .createDataFrame(Seq(Tuple1("My name Andres and I live in Colombia")))
    .toDF("text")

  val documentAssembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

  val sentenceDetector = new SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
    .setUseAbbreviations(false)

  val tokenizer = new Tokenizer()
    .setInputCols(Array("sentence"))
    .setOutputCol("token")

  val embeddings = WordEmbeddingsModel.pretrained()
    .setInputCols("sentence", "token")
    .setOutputCol("embeddings")
    .setCaseSensitive(false)

  val ner = NerDLModel.pretrained()
    .setInputCols("sentence", "token", "embeddings")
    .setOutputCol("ner")
    .setIncludeConfidence(true)
  // NOTE(review): the original had a bare `ner.getClasses` here whose result
  // was discarded — a no-op statement; removed. Wrap it in println(...) if the
  // label set should actually be displayed.

  // Match a regex over the entity-tag sequence: a PER followed by a LOC.
  val chunker = new NerChunker()
    .setInputCols(Array("sentence", "ner"))
    .setOutputCol("ner_chunk")
    .setRegexParsers(Array("<PER>.*<LOC>"))

  val recursivePipeline = new RecursivePipeline()
    .setStages(Array(
      documentAssembler,
      sentenceDetector,
      tokenizer,
      embeddings,
      ner,
      chunker
    ))

  val nermodel = recursivePipeline.fit(data).transform(data)

  val dataframe = nermodel.select("ner_chunk.result")
  dataframe.show(truncate = false)
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/** Detects named entities with a pretrained NER model and converts the IOB
  * tags into entity chunks via NerConverterInternal, applying a confidence
  * threshold and printing intermediate results. */
object NerConverterInternalExample extends App {

  val data = ResourceHelper.spark
    .createDataFrame(Seq(Tuple1("My name is Andres and I live in Colombia")))
    .toDF("text")

  val documentAssembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

  val sentenceDetector = new SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
    .setUseAbbreviations(false)

  val tokenizer = new Tokenizer()
    .setInputCols(Array("sentence"))
    .setOutputCol("token")

  // FIX(review): originally set to ("document", "token"), inconsistent with
  // the downstream NER stage and the sibling examples, which all work over
  // ("sentence", "token"); aligned to "sentence".
  val embeddings = WordEmbeddingsModel.pretrained()
    .setInputCols("sentence", "token")
    .setOutputCol("embeddings")
    .setCaseSensitive(false)

  val ner = NerDLModel.pretrained()
    .setInputCols("sentence", "token", "embeddings")
    .setOutputCol("ner")
    .setIncludeConfidence(true)

  // Keep only entities with confidence >= 0.99 (was written as 9900e-4f,
  // the same value in a less readable form).
  val converter = new NerConverterInternal()
    .setInputCols("sentence", "token", "ner")
    .setOutputCol("entities")
    .setPreservePosition(false)
    .setThreshold(0.99f)

  val recursivePipeline = new RecursivePipeline()
    .setStages(Array(
      documentAssembler,
      sentenceDetector,
      tokenizer,
      embeddings,
      ner,
      converter
    ))

  val nermodel = recursivePipeline.fit(data).transform(data)

  nermodel.select("token.result").show(1, false)
  nermodel.select("embeddings.result").show(1, false)
  nermodel.select("entities.result").show(1, false)
  nermodel.select("entities").show(1, false)
}

0 commit comments

Comments
 (0)