-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathWordFilter.cfc
78 lines (70 loc) · 2.87 KB
/
WordFilter.cfc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
<cfcomponent displayname="MS Word Filter" output="false" hint="I strip out the junk MS Word puts into HTML.">
<!---
    Cleans HTML that was pasted from MS Word.

    Text is treated as "from Word" when it is a simple value, is at least 200
    characters long after trimming, and contains the marker "MsoNormal".
    Cleaning then runs four passes in order: drop the Word style block, drop
    style attributes, drop span tags, drop empty lines/paragraphs.
--->

    <cffunction name="init" access="public" returntype="any" output="no" hint="I instantiate and return this component.">
        <!--- No state to set up; return the instance so calls can be chained. --->
        <cfreturn This>
    </cffunction>

    <!---
        Cleans every field of the given struct that looks like MS Word text.

        data:      struct of values; non-Word and non-string values are left as they are.
        maxpoints: accepted for interface compatibility; not referenced by this component.

        Returns Arguments.data with the Word-looking fields replaced by their
        cleaned versions. The fields are assigned on Arguments.data directly;
        CFML normally passes structs by reference, so expect the caller's
        struct to be modified as well.
    --->
    <cffunction name="filter" access="public" returntype="struct" output="no" hint="I run the filter on the given structure and return it.">
        <cfargument name="data" type="struct" required="yes">
        <cfargument name="maxpoints" type="numeric" default="0">

        <cfset var field = "">

        <cfloop collection="#Arguments.data#" item="field">
            <!---
                StructKeyExists guards against keys whose value is null, which
                cannot be read with bracket notation.
                cleanWord() repeats the isMSWordText() test; testing here as
                well avoids re-assigning fields that do not need cleaning.
            --->
            <cfif StructKeyExists(Arguments.data,field) AND isMSWordText(Arguments.data[field])>
                <cfset Arguments.data[field] = cleanWord(Arguments.data[field])>
            </cfif>
        </cfloop>

        <cfreturn Arguments.data>
    </cffunction>

<cfscript>
    //I report whether the given value looks like HTML pasted from MS Word.
    function isMSWordText(string) {
        //Complex values (structs, arrays, queries...) can't be MS Word text.
        if ( NOT isSimpleValue(Arguments.string) ) {
            return false;
        }
        //Anything shorter than 200 characters (after trimming) is not treated as MS Word text.
        if ( Len(Trim(Arguments.string)) LT 200 ) {
            return false;
        }
        //Word marks its paragraphs with the "MsoNormal" class.
        return ( FindNoCase('MsoNormal',Arguments.string) GT 0 );
    }

    //I return the given string with Word markup cleaned out; text that does not look like Word text is returned untouched.
    function cleanWord(string) {
        var result = Arguments.string;

        if ( isMSWordText(result) ) {
            result = cleanWordStyleBlock(result);
            result = cleanStyleAtts(result);
            result = cleanSpans(result);
            result = cleanEmptyLines(result);
        }

        return result;
    }

    //I remove the first Word style block (<style type="text/css"><!-- ... </style>) from the string.
    function cleanWordStyleBlock(string) {
        var result = Arguments.string;
        var closeTag = '</style>';
        var styleOpen = FindNoCase('<style type="text/css"><!--',result);
        var styleClose = 0;//Position where the closing tag starts (0 = not found).

        if ( styleOpen GT 0 ) {
            styleClose = FindNoCase(closeTag,result,styleOpen+1);
            //Only cut when the block is actually closed; otherwise leave the text alone.
            if ( styleClose GT 0 ) {
                //RemoveChars takes a start position and a COUNT: everything from the
                //opening tag through the end of the closing tag, and nothing after it.
                result = RemoveChars(result,styleOpen,styleClose + Len(closeTag) - styleOpen);
            }
        }

        return result;
    }

    //I remove double-quoted style attributes (a space, then style="...") from the string.
    function cleanStyleAtts(string) {
        var result = Arguments.string;

        result = REReplaceNoCase(result,' style=".*?"','','ALL');

        return result;
    }

    //I remove span tags (opening and closing) while keeping the text between them.
    function cleanSpans(string) {
        var result = Arguments.string;

        //Opening spans may still carry attributes (lang, class...). Strip those as well,
        //otherwise removing every </span> would leave unbalanced <span ...> tags behind.
        result = REReplaceNoCase(result,'</?span(\s[^>]*)?>','','ALL');

        return result;
    }

    //I remove non-breaking spaces, line breaks at the end of paragraphs, and empty paragraphs.
    function cleanEmptyLines(string) {
        var result = Arguments.string;

        //The non-breaking spaces Word adds aren't needed: drop the entity and the literal character.
        //Ordinary spaces are never touched.
        result = ReplaceNoCase(result,'&nbsp;','','ALL');
        result = Replace(result,Chr(160),'','ALL');
        //No need for a line break at the end of a paragraph.
        result = REReplaceNoCase(result,'<br ?/?>\s*</p>','</p>','ALL');
        //Ditch now-empty paragraphs, including ones that carry attributes (e.g. class="MsoNormal").
        result = REReplaceNoCase(result,'<p(\s[^>]*)?>\s*</p>','','ALL');

        return result;
    }
</cfscript>
</cfcomponent>