Skip to content

Commit

Permalink
Added HTML support
Browse files Browse the repository at this point in the history
  • Loading branch information
Timothy Spann committed Mar 11, 2018
1 parent a154e2c commit 97759b9
Show file tree
Hide file tree
Showing 18 changed files with 1,079 additions and 686 deletions.
6 changes: 3 additions & 3 deletions nifi-extracttext-nar/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
<parent>
<groupId>com.dataflowdeveloper</groupId>
<artifactId>extracttext-processor</artifactId>
<version>1.0</version>
<version>1.5</version>
</parent>

<artifactId>nifi-extracttext-nar</artifactId>
<version>1.0</version>
<version>1.5</version>
<packaging>nar</packaging>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
Expand All @@ -34,7 +34,7 @@
<dependency>
<groupId>com.dataflowdeveloper</groupId>
<artifactId>nifi-extracttext-processors</artifactId>
<version>1.0</version>
<version>1.5</version>
</dependency>
</dependencies>

Expand Down
2 changes: 1 addition & 1 deletion nifi-extracttext-nar/target/classes/META-INF/DEPENDENCIES
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ From: 'an unknown organization'
License: Public Domain

From: 'Apache NiFi Project' (http://nifi.apache.org/)
- nifi-extracttext-processors (http://nifi.apache.org/nifi-nar-bundles/extracttext-processor/nifi-extracttext-processors) com.dataflowdeveloper:nifi-extracttext-processors:jar:1.0
- nifi-extracttext-processors (http://nifi.apache.org/nifi-nar-bundles/extracttext-processor/nifi-extracttext-processors) com.dataflowdeveloper:nifi-extracttext-processors:jar:1.5
License: Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
- nifi-utils (http://nifi.apache.org/nifi-commons/nifi-utils) org.apache.nifi:nifi-utils:jar:1.5.0
License: Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
Expand Down
4 changes: 2 additions & 2 deletions nifi-extracttext-nar/target/maven-archiver/pom.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#Generated by Maven
#Wed Feb 21 16:18:54 EST 2018
version=1.0
#Sun Mar 11 10:53:51 EDT 2018
version=1.5
groupId=com.dataflowdeveloper
artifactId=nifi-extracttext-nar
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ From: 'an unknown organization'
License: Public Domain

From: 'Apache NiFi Project' (http://nifi.apache.org/)
- nifi-extracttext-processors (http://nifi.apache.org/nifi-nar-bundles/extracttext-processor/nifi-extracttext-processors) com.dataflowdeveloper:nifi-extracttext-processors:jar:1.0
- nifi-extracttext-processors (http://nifi.apache.org/nifi-nar-bundles/extracttext-processor/nifi-extracttext-processors) com.dataflowdeveloper:nifi-extracttext-processors:jar:1.5
License: Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
- nifi-utils (http://nifi.apache.org/nifi-commons/nifi-utils) org.apache.nifi:nifi-utils:jar:1.5.0
License: Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ From: 'an unknown organization'
License: Public Domain

From: 'Apache NiFi Project' (http://nifi.apache.org/)
- nifi-extracttext-processors (http://nifi.apache.org/nifi-nar-bundles/extracttext-processor/nifi-extracttext-processors) com.dataflowdeveloper:nifi-extracttext-processors:jar:1.0
- nifi-extracttext-processors (http://nifi.apache.org/nifi-nar-bundles/extracttext-processor/nifi-extracttext-processors) com.dataflowdeveloper:nifi-extracttext-processors:jar:1.5
License: Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
- nifi-utils (http://nifi.apache.org/nifi-commons/nifi-utils) org.apache.nifi:nifi-utils:jar:1.5.0
License: Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
Expand Down
2 changes: 1 addition & 1 deletion nifi-extracttext-processors/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
<parent>
<groupId>com.dataflowdeveloper</groupId>
<artifactId>extracttext-processor</artifactId>
<version>1.0</version>
<version>1.5</version>
</parent>

<artifactId>nifi-extracttext-processors</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.dataflowdeveloper.processors.process;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
Expand All @@ -29,6 +30,12 @@
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.ReadsAttributes;
import org.apache.nifi.annotation.behavior.WritesAttribute;
Expand All @@ -49,29 +56,48 @@
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.xml.sax.SAXException;

@Tags({ "extracttextprocessor" })
@CapabilityDescription("Run Tika Text Extraction from PDF, Word, Excel")
@Tags({ "extracttextprocessortika" })
@CapabilityDescription("Run Apache Tika Text Extraction from PDF, Word, Excel. Parameter for HTML or TEXT output. Parameter for Maximum Length returned.")
@SeeAlso({})
@ReadsAttributes({ @ReadsAttribute(attribute = "", description = "") })
@WritesAttributes({ @WritesAttribute(attribute = "", description = "") })
public class ExtractTextProcessor extends AbstractProcessor {

public static final PropertyDescriptor MAX_TEXT_LENGTH = new PropertyDescriptor
.Builder().name("MAX_TEXT_LENGTH")
.displayName("Max Output Text Length")
.description("The maximum length of text to retrieve. This is used to limit memory usage for dealing with large files. Specify -1 for unlimited length.")
.required(false)
.defaultValue("-1")
.addValidator(StandardValidators.INTEGER_VALIDATOR)
.expressionLanguageSupported(false)
.build();

public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
.description("Successfully determine sentiment.").build();

public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
.description("Failed to determine sentiment.").build();
private static final String TEXT_PLAIN = "text/plain";
public static final String TEXT_HTML = "text/html";
public static final String HTML_FORMAT = "html";
public static final String TEXT_FORMAT = "text";

public static final String FIELD_HTML_OUTPUT = "HTML_OUTPUT";
public static final String FIELD_MAX_TEXT_LENGTH = "MAX_TEXT_LENGTH";
public static final String FIELD_SUCCESS = "success";
public static final String FIELD_FAILURE = "failure";

/**
 * Optional processor property capping how much extracted text is returned.
 * Registered under the name {@link #FIELD_MAX_TEXT_LENGTH}; the value must pass
 * the integer validator, defaults to "-1" (documented below as unlimited), and
 * expression language is not supported.
 * NOTE(review): in the visible onTrigger code the limit is only handed to Tika
 * on the plain-text path; the HTML path does not appear to apply it - confirm.
 */
public static final PropertyDescriptor MAX_TEXT_LENGTH = new PropertyDescriptor.Builder()
.name(FIELD_MAX_TEXT_LENGTH).displayName("Max Output Text Length")
.description(
"The maximum length of text to retrieve. This is used to limit memory usage for dealing with large files. Specify -1 for unlimited length.")
.required(false).defaultValue("-1").addValidator(StandardValidators.INTEGER_VALIDATOR)
.expressionLanguageSupported(false).build();

/**
 * Optional processor property selecting the output format. Registered under
 * the name {@link #FIELD_HTML_OUTPUT} and defaulting to {@link #TEXT_FORMAT}.
 * Only a non-blank check is applied, so any non-blank string is accepted as
 * valid; onTrigger compares the value with {@code equals(HTML_FORMAT)}, which
 * means every value other than the exact string "html" takes the text path.
 * Expression language is not supported.
 */
public static final PropertyDescriptor HTML_OUTPUT = new PropertyDescriptor.Builder()
.name(FIELD_HTML_OUTPUT)
.displayName("HTML Output instead of text")
.description("Send html for HTML output or text for Text output")
.required(false).defaultValue(TEXT_FORMAT)
.addValidator(StandardValidators.NON_BLANK_VALIDATOR)
.expressionLanguageSupported(false).build();

/** Relationship named {@link #FIELD_SUCCESS}; onTrigger routes here when no extraction error was flagged. */
public static final Relationship REL_SUCCESS = new Relationship.Builder().name(FIELD_SUCCESS)
.description("Successfully extract content.").build();

/** Relationship named {@link #FIELD_FAILURE}; onTrigger routes here when the write callback flagged an error. */
public static final Relationship REL_FAILURE = new Relationship.Builder().name(FIELD_FAILURE)
.description("Failed to extract content.").build();

private List<PropertyDescriptor> descriptors;
private Set<Relationship> relationships;
Expand All @@ -80,6 +106,7 @@ public class ExtractTextProcessor extends AbstractProcessor {
protected void init(final ProcessorInitializationContext context) {
final List<PropertyDescriptor> descriptors = new ArrayList<PropertyDescriptor>();
descriptors.add(MAX_TEXT_LENGTH);
descriptors.add(HTML_OUTPUT);
this.descriptors = Collections.unmodifiableList(descriptors);

final Set<Relationship> relationships = new HashSet<Relationship>();
Expand Down Expand Up @@ -109,40 +136,85 @@ public void onTrigger(final ProcessContext context, final ProcessSession session
if (flowFile == null) {
flowFile = session.create();
}

final int maxTextLength = context.getProperty(MAX_TEXT_LENGTH).asInteger();
final String outputMode = context.getProperty(HTML_OUTPUT).getValue();
final String filename = flowFile.getAttribute("filename");

try {
final AtomicReference<String> type = new AtomicReference<>();
final AtomicReference<Boolean> wasError = new AtomicReference<>(false);
flowFile= session.write(flowFile, new StreamCallback() {

flowFile = session.write(flowFile, new StreamCallback() {
@Override
public void process(InputStream inputStream, OutputStream outputStream) throws IOException {
BufferedInputStream buffStream = new BufferedInputStream(inputStream);
Tika tika = new Tika();
String text = "";
try {
type.set(tika.detect(buffStream, filename));
tika.setMaxStringLength(maxTextLength);
text = tika.parseToString(buffStream);

if (outputMode.equals(HTML_FORMAT)) {
// http://lifeinide.com/post/2013-10-18-convert-document-to-html-with-apache-tika/

ByteArrayOutputStream out = new ByteArrayOutputStream();
SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
handler.setResult(new StreamResult(out));
ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);

AutoDetectParser parser = new AutoDetectParser();
parser.parse(buffStream, handler1, new Metadata());
text = new String(out.toByteArray(), "UTF-8");
} else {
tika.setMaxStringLength(maxTextLength);
text = tika.parseToString(buffStream);
}

} catch (TikaException e) {
getLogger().error("Apache Tika failed to parse input " + e.getLocalizedMessage());
e.printStackTrace();
wasError.set(true);
return;
} catch (SAXException e) {
getLogger().error(
"Apache Tika failed to parse input on XML/HTML error " + e.getLocalizedMessage());
wasError.set(true);
} catch (TransformerConfigurationException e) {
getLogger().error(
"Apache Tika failed to parse input on XML/HTML error " + e.getLocalizedMessage());
wasError.set(true);
}

outputStream.write(text.getBytes());
buffStream.close();
}
});

if (wasError.get()) {
session.transfer(flowFile, REL_FAILURE);
} else {
Map<String, String> mimeAttrs = new HashMap<String, String>() {{ put("mime.type", "text/plain"); put("orig.mime.type", type.get()); }};
} else {

Map<String, String> mimeAttrs = null;

if (outputMode.equals(HTML_FORMAT)) {
mimeAttrs = new HashMap<String, String>() {
{
put("mime.type", TEXT_HTML);
put("orig.mime.type", type.get());
}
};
} else {
mimeAttrs = new HashMap<String, String>() {
{
put("mime.type", TEXT_PLAIN);
put("orig.mime.type", type.get());
}
};
}

flowFile = session.putAllAttributes(flowFile, mimeAttrs);
session.transfer(flowFile, REL_SUCCESS);
}
Expand All @@ -153,4 +225,4 @@ public void process(InputStream inputStream, OutputStream outputStream) throws I
throw t;
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,39 @@ public void when_running_processor_mime_type_should_be_discovered_for_pdf_input(
}
}

/**
 * Runs the processor over {@code simple.pdf} with the output format property
 * set to HTML and checks that the single resulting flow file is routed to
 * success, tagged {@code mime.type=text/html}, and keeps the detected source
 * type in {@code orig.mime.type}.
 */
@Test
public void when_running_processor_mime_type_should_be_discovered_for_pdf_input_html() {

    final String filename = "simple.pdf";

    // Plain map instead of double-brace initialisation, which would create an
    // anonymous HashMap subclass just to hold one entry.
    final Map<String, String> attrs = new HashMap<String, String>();
    attrs.put("filename", filename);

    testRunner.setProperty(ExtractTextProcessor.FIELD_HTML_OUTPUT, ExtractTextProcessor.HTML_FORMAT);

    try {
        MockFlowFile flowFile = testRunner.enqueue(new FileInputStream(new File("src/test/resources/" + filename)));
        flowFile.putAttributes(attrs);
    } catch (FileNotFoundException e) {
        // A missing fixture must fail the test here; continuing would run the
        // processor with nothing enqueued and report a misleading result.
        throw new AssertionError("Missing test resource src/test/resources/" + filename, e);
    }

    testRunner.assertValid();
    testRunner.run();

    // Exactly one flow file went in, so exactly one must come out on success;
    // pinning the count keeps the attribute loop below from passing vacuously.
    testRunner.assertAllFlowFilesTransferred(ExtractTextProcessor.REL_SUCCESS, 1);
    List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ExtractTextProcessor.REL_SUCCESS);
    for (MockFlowFile mockFile : successFiles) {
        mockFile.assertAttributeExists("mime.type");
        mockFile.assertAttributeEquals("mime.type", "text/html");
        mockFile.assertAttributeExists("orig.mime.type");
        mockFile.assertAttributeEquals("orig.mime.type", "application/pdf");
    }
}

@Test
public void when_running_processor_mime_type_should_be_discovered_for_doc_input() {

Expand Down Expand Up @@ -267,4 +300,4 @@ public void when_running_processor_with_limit_text_length_should_be_less_than_or
}
}

}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#Created by Apache Maven 3.1.1
#Wed Feb 21 16:18:53 EST 2018
version=1.0
#Created by Apache Maven 3.3.9
#Sun Mar 11 10:53:50 EDT 2018
version=1.5
groupId=com.dataflowdeveloper
artifactId=nifi-extracttext-processors
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
com/dataflowdeveloper/processors/process/ExtractTextProcessor$1.class
com/dataflowdeveloper/processors/process/ExtractTextProcessor.class
com/dataflowdeveloper/processors/process/ExtractTextProcessor$3.class
com/dataflowdeveloper/processors/process/ExtractTextProcessor$2.class
Original file line number Diff line number Diff line change
@@ -1 +1 @@
/Users/willy.demis/Documents/workspace/Nifi/nifi-extracttext-processor/nifi-extracttext-processors/src/main/java/com/dataflowdeveloper/processors/process/ExtractTextProcessor.java
/Volumes/seagate/projects/nifi-extracttext-processor/nifi-extracttext-processors/src/main/java/com/dataflowdeveloper/processors/process/ExtractTextProcessor.java
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
com/dataflowdeveloper/processors/process/ExtractTextProcessorTest$4.class
com/dataflowdeveloper/processors/process/ExtractTextProcessorTest$9.class
com/dataflowdeveloper/processors/process/ExtractTextProcessorTest$6.class
com/dataflowdeveloper/processors/process/ExtractTextProcessorTest$5.class
com/dataflowdeveloper/processors/process/ExtractTextProcessorTest.class
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1 @@
/Users/willy.demis/Documents/workspace/Nifi/nifi-extracttext-processor/nifi-extracttext-processors/src/test/java/com/dataflowdeveloper/processors/process/ExtractTextProcessorTest.java
/Volumes/seagate/projects/nifi-extracttext-processor/nifi-extracttext-processors/src/test/java/com/dataflowdeveloper/processors/process/ExtractTextProcessorTest.java
Original file line number Diff line number Diff line change
@@ -1 +1 @@
/Users/willy.demis/Documents/workspace/Nifi/nifi-extracttext-processor/nifi-extracttext-processors/src/test/java/com/dataflowdeveloper/processors/process/ExtractTextProcessorTest.java
/Volumes/seagate/projects/nifi-extracttext-processor/nifi-extracttext-processors/src/test/java/com/dataflowdeveloper/processors/process/ExtractTextProcessorTest.java
Loading

0 comments on commit 97759b9

Please sign in to comment.