Skip to content

Commit

Permalink
Working NER tagger Akka system, #110
Browse files Browse the repository at this point in the history
  • Loading branch information
kudkudak committed Apr 21, 2014
1 parent 6d2b2cc commit 5ed1573
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class MantisKafkaFetcherBasic extends Actor {
//Encoding for JSON parsing
implicit val enc = Encodings.`UTF-8`
//Pause the fetching thread while the queue length exceeds this limit
val maximumQueueSize = 1000
val maximumQueueSize = 100
//Queue to store messages
val Q = new mutable.SynchronizedQueue[scala.collection.mutable.Map[String, AnyRef]]()

Expand Down Expand Up @@ -111,9 +111,13 @@ class MantisKafkaFetcherBasic extends Actor {
//TODO: improve
if (!tagged_in_current_topic.contains(uuid)) {
var entry = scala.collection.mutable.HashMap[String, AnyRef]()
entry += "title" -> msg.title.as[String]
entry += "summary" -> msg.summary.as[String]
entry += "text" -> msg.text.as[String]

//KafkaActor should act as a filter for garbage: it HAS to parse each message and
//also has to improve its quality. These fields are Unicode strings decoded from UTF-8!

entry += "title" -> msg.title.as[String].split("\\r?\\n").map(_.trim).mkString(" ")
entry += "summary" -> msg.summary.as[String].split("\\r?\\n").map(_.trim).mkString(" ")
entry += "text" -> msg.text.as[String].split("\\r?\\n").map(_.trim).mkString(" ")
entry += "uuid" -> msg.uuid.as[String]
Q.enqueue(entry)
}
Expand All @@ -124,12 +128,12 @@ class MantisKafkaFetcherBasic extends Actor {
case e: Exception => println("Failed parsing consumer message offset=" + msgoffset.offset.toString+" "+msgoffset.message.toString)
}

if(Q.length % 100 == 0){
if(Q.length % 1000 == 0){
println("Already enqueued "+Q.length.toString+" news")
}

while(Q.length > maximumQueueSize)
java.lang.Thread.sleep(100)
java.lang.Thread.sleep(1000)

}
}
Expand Down
3 changes: 3 additions & 0 deletions mantis_shrimp/src/main/scala/mantisshrimp/main.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ package mantisshrimp
import akka.actor.{Props, ActorSystem}
import ner._


//TODO: write tests

object Main extends App{

def runSystem = {
Expand Down
6 changes: 2 additions & 4 deletions mantis_shrimp/src/main/scala/ner/SevenClassNERTagger.scala
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,11 @@ class SevenClassNERTagger extends NERTagger {
val serializedClassifier = "stanford_classifiers/english.muc.7class.distsim.crf.ser.gz"
val classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier)



def tag(text: String): Seq[MantisTag] = {

val keywords = ListBuffer[(String, String)]()
val tag_list = ListBuffer[MantisTag]()

val out = classifier.classifyWithInlineXML(text)
val out = classifier.classifyWithInlineXML(scala.xml.Utility.escape(text))

try{
val xml = scala.xml.XML.loadString("<s>" + out + "</s>")
Expand All @@ -37,6 +34,7 @@ class SevenClassNERTagger extends NERTagger {
println("Failed parsing XML out of Stanford")
println("Input text "+text)
println(out.toString)
println("Error "+e.toString)

}
}
Expand Down

0 comments on commit 5ed1573

Please sign in to comment.