
Commit

Merge pull request #2 from wdemis/tika-1.7-withUnitTesting-feature
Added significant unit testing, fleshed out max output property, and …
Timothy Spann authored Feb 21, 2018
2 parents 720cd03 + af38551 commit a154e2c
Showing 36 changed files with 2,691 additions and 602 deletions.
2 changes: 1 addition & 1 deletion build.sh
@@ -1 +1 @@
mvn install -DskipTests
mvn install
1 change: 1 addition & 0 deletions nifi-extracttext-nar/pom.xml
@@ -23,6 +23,7 @@
</parent>

<artifactId>nifi-extracttext-nar</artifactId>
<version>1.0</version>
<packaging>nar</packaging>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
142 changes: 56 additions & 86 deletions nifi-extracttext-nar/target/classes/META-INF/DEPENDENCIES

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion nifi-extracttext-nar/target/classes/META-INF/NOTICE
@@ -1,6 +1,6 @@

nifi-extracttext-nar
Copyright 2017 Apache NiFi Project
Copyright 2018 Apache NiFi Project

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
2 changes: 1 addition & 1 deletion nifi-extracttext-nar/target/maven-archiver/pom.properties
@@ -1,5 +1,5 @@
#Generated by Maven
#Mon Feb 06 10:24:56 EST 2017
#Wed Feb 21 16:18:54 EST 2018
version=1.0
groupId=com.dataflowdeveloper
artifactId=nifi-extracttext-nar

Large diffs are not rendered by default.

@@ -1,6 +1,6 @@

nifi-extracttext-nar
Copyright 2017 Apache NiFi Project
Copyright 2018 Apache NiFi Project

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
142 changes: 56 additions & 86 deletions nifi-extracttext-nar/target/test-classes/META-INF/DEPENDENCIES

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion nifi-extracttext-nar/target/test-classes/META-INF/NOTICE
@@ -1,6 +1,6 @@

nifi-extracttext-nar
Copyright 2017 Apache NiFi Project
Copyright 2018 Apache NiFi Project

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
70 changes: 22 additions & 48 deletions nifi-extracttext-processors/pom.xml
@@ -21,7 +21,7 @@
<artifactId>extracttext-processor</artifactId>
<version>1.0</version>
</parent>

<artifactId>nifi-extracttext-processors</artifactId>
<packaging>jar</packaging>

@@ -32,7 +32,7 @@
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-processor-utils</artifactId>
<artifactId>nifi-utils</artifactId>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
@@ -49,51 +49,25 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.14</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.14</version>
</dependency>

<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-langdetect</artifactId>
<version>1.14</version>
</dependency>


<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.5.0</version>
</dependency>

<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.8</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.8</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.17</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.17</version>
</dependency>
</dependencies>
<repositories>
<repository>
<id>jitpack.io</id>
<url>https://jitpack.io</url>
</repository>
</repositories>
</project>
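
The dependency list above settles on Apache Tika 1.17 (tika-core and tika-parsers) together with PDFBox 2.0.8 and commons-io 2.5, replacing the older Tika 1.14 stack. For orientation, a minimal sketch of the Tika facade API those artifacts provide follows; "sample.pdf" is a placeholder path, not something from this commit, and only methods that exist on org.apache.tika.Tika are used, mirroring the detect/setMaxStringLength/parseToString calls in the processor diff further down.

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

public class TikaFacadeSketch {
    public static void main(String[] args) throws Exception {
        Tika tika = new Tika();
        // Cap the extracted text, mirroring the processor's Max Output Text Length property.
        tika.setMaxStringLength(10_000);

        // "sample.pdf" is a placeholder -- any PDF, Word, or HTML document works.
        try (InputStream in = new BufferedInputStream(
                Files.newInputStream(Paths.get("sample.pdf")))) {
            String mimeType = tika.detect(in, "sample.pdf"); // uses mark/reset, stream stays readable
            String text = tika.parseToString(in);            // honors the max string length set above
            System.out.println(mimeType);
            System.out.println(text);
        } catch (TikaException e) {
            System.err.println("Tika could not parse the document: " + e.getMessage());
        }
    }
}

Buffering the stream matters because detect() relies on mark/reset before parseToString() consumes the same stream, which is also why the processor below wraps its input in a BufferedInputStream.
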
@@ -16,14 +16,18 @@
*/
package com.dataflowdeveloper.processors.process;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.ReadsAttributes;
@@ -42,6 +46,7 @@
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.StreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

@@ -52,8 +57,16 @@
@WritesAttributes({ @WritesAttribute(attribute = "", description = "") })
public class ExtractTextProcessor extends AbstractProcessor {

public static final String ATTRIBUTE_OUTPUT_NAME = "body";

public static final PropertyDescriptor MAX_TEXT_LENGTH = new PropertyDescriptor
.Builder().name("MAX_TEXT_LENGTH")
.displayName("Max Output Text Length")
.description("The maximum length of text to retrieve. This is used to limit memory usage for dealing with large files. Specify -1 for unlimited length.")
.required(false)
.defaultValue("-1")
.addValidator(StandardValidators.INTEGER_VALIDATOR)
.expressionLanguageSupported(false)
.build();

public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
.description("Successfully determine sentiment.").build();

@@ -66,6 +79,7 @@ public class ExtractTextProcessor extends AbstractProcessor {
@Override
protected void init(final ProcessorInitializationContext context) {
final List<PropertyDescriptor> descriptors = new ArrayList<PropertyDescriptor>();
descriptors.add(MAX_TEXT_LENGTH);
this.descriptors = Collections.unmodifiableList(descriptors);

final Set<Relationship> relationships = new HashSet<Relationship>();
@@ -95,25 +109,43 @@ public void onTrigger(final ProcessContext context, final ProcessSession session
if (flowFile == null) {
flowFile = session.create();
}

final int maxTextLength = context.getProperty(MAX_TEXT_LENGTH).asInteger();
final String filename = flowFile.getAttribute("filename");

try {
flowFile.getAttributes();
flowFile = session.putAttribute(flowFile, "mime.type", "application/json");
flowFile = session.write(flowFile, new StreamCallback() {
final AtomicReference<String> type = new AtomicReference<>();
final AtomicReference<Boolean> wasError = new AtomicReference<>(false);

flowFile = session.write(flowFile, new StreamCallback() {
@Override
public void process(InputStream inputStream, OutputStream outputStream) throws IOException {
BufferedInputStream buffStream = new BufferedInputStream(inputStream);
Tika tika = new Tika();
String text = "";
try {
text = tika.parseToString(inputStream);
type.set(tika.detect(buffStream, filename));
tika.setMaxStringLength(maxTextLength);
text = tika.parseToString(buffStream);
} catch (TikaException e) {
getLogger().error("Apache Tika failed to parse input " + e.getLocalizedMessage());
e.printStackTrace();
wasError.set(true);
return;
}
// TODO: wrap in JSON???

outputStream.write(text.getBytes());
buffStream.close();
}
});
session.transfer(flowFile, REL_SUCCESS);

if (wasError.get()) {
session.transfer(flowFile, REL_FAILURE);
} else {
Map<String, String> mimeAttrs = new HashMap<String, String>() {{ put("mime.type", "text/plain"); put("orig.mime.type", type.get()); }};
flowFile = session.putAllAttributes(flowFile, mimeAttrs);
session.transfer(flowFile, REL_SUCCESS);
}
session.commit();
} catch (final Throwable t) {
getLogger().error("Unable to process ExtractTextProcessor file " + t.getLocalizedMessage());
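
The commit message mentions significant unit testing; the sketch below is a hedged illustration of how such a test might exercise the new Max Output Text Length property with NiFi's mock framework (nifi-mock, JUnit 4 assumed). The test class name and enqueued text are illustrative assumptions, not code from this commit; only the TestRunner API and the processor's public MAX_TEXT_LENGTH and REL_SUCCESS fields shown above are relied on.

import java.nio.charset.StandardCharsets;

import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.Assert;
import org.junit.Test;

import com.dataflowdeveloper.processors.process.ExtractTextProcessor;

public class ExtractTextProcessorSketchTest {

    @Test
    public void testMaxTextLengthLimitsOutput() {
        TestRunner runner = TestRunners.newTestRunner(new ExtractTextProcessor());

        // Limit extracted text to 10 characters through the new property.
        runner.setProperty(ExtractTextProcessor.MAX_TEXT_LENGTH, "10");

        // Plain text is enough to drive the Tika path for this sketch.
        runner.enqueue("This flow file body is far longer than ten characters."
                .getBytes(StandardCharsets.UTF_8));
        runner.run();

        runner.assertAllFlowFilesTransferred(ExtractTextProcessor.REL_SUCCESS, 1);
        MockFlowFile out = runner.getFlowFilesForRelationship(ExtractTextProcessor.REL_SUCCESS).get(0);

        // On success the processor rewrites mime.type and truncates the extracted body.
        out.assertAttributeEquals("mime.type", "text/plain");
        Assert.assertTrue(new String(out.toByteArray(), StandardCharsets.UTF_8).length() <= 10);
    }
}
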
