-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMain.cs
75 lines (70 loc) · 2.17 KB
/
Main.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
using System;
using System.IO;
using System.Collections.Specialized;
using System.Text.RegularExpressions;
namespace WordCleaner
{
class MainClass
{
static void Main(string[] args)
{
if (args.Length == 0 || String.IsNullOrEmpty(args[0]))
{
Console.WriteLine("No filename provided.");
return;
}
string filepath = args[0];
if (Path.GetFileName(filepath) == args[0])
{
filepath = Path.Combine(Environment.CurrentDirectory, filepath);
}
if (!File.Exists(args[0]))
{
Console.WriteLine("File doesn't exist.");
}
string html = File.ReadAllText(filepath);
Console.WriteLine("input html is " + html.Length + " chars");
html = CleanWordHtml(html);
html = FixEntities(html);
filepath = Path.GetFileNameWithoutExtension(filepath) + ".modified.htm";
File.WriteAllText(filepath, html);
Console.WriteLine("cleaned html is " + html.Length + " chars");
}
static string CleanWordHtml(string html)
{
StringCollection sc = new StringCollection();
// get rid of unnecessary tag spans (comments and title)
sc.Add(@"<!--(\w|\W)+?-->");
sc.Add(@"<title>(\w|\W)+?</title>");
// Get rid of classes and styles
sc.Add(@"\s?class=\w+");
sc.Add(@"\s+style='[^']+'");
// Get rid of unnecessary tags
sc.Add(
@"<(meta|link|/?o:|/?style|/?div|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>");
// Get rid of empty paragraph tags
sc.Add(@"(<[^>]+>)+ (</\w+>)+");
// remove bizarre v: element attached to <img> tag
sc.Add(@"\s+v:\w+=""[^""]+""");
// remove extra lines
sc.Add(@"(\n\r){2,}");
foreach (string s in sc)
{
html = Regex.Replace(html, s, "", RegexOptions.IgnoreCase);
}
return html;
}
static string FixEntities(string html)
{
NameValueCollection nvc = new NameValueCollection();
nvc.Add("\"", "“");
nvc.Add("\"", "”");
nvc.Add("–", "—");
foreach (string key in nvc.Keys)
{
html = html.Replace(key, nvc[key]);
}
return html;
}
}
}