#!/usr/bin/python
# Scrape a SharePoint wiki over NTLM-authenticated HTTP and convert each page
# to Markdown suitable for import into Confluence.
import mechanize
import html2text
import codecs
import re
import yaml
import os
from ntlm import HTTPNtlmAuthHandler
from bs4 import BeautifulSoup
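# Dependencies (assumed pip package names for a Python 2 environment):
#   mechanize, html2text, PyYAML, python-ntlm, beautifulsoup4, html5lib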
def parse_page(url, output):
    """Fetch a SharePoint wiki page, rewrite its links and images, and write
    it to output/<output> as Markdown."""
    global handled, user, password, scrape_recursively, content_div_id, \
        sharepoint_url, wiki_base_url, wiki_index, confluence_space_key, \
        empties, has_images, direct_confluence_entry, add_legacy_link
    passman = mechanize.HTTPPasswordMgrWithDefaultRealm()
    passman.add_password(None, url, user, password)
    # create the NTLM authentication handler
    auth_NTLM = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(passman)
    # create and install the opener
    browser = mechanize.Browser()
    browser.add_handler(auth_NTLM)
    # ignore robots.txt
    browser.set_handle_robots(False)
    # retrieve the page and mark it as handled (read the body only once)
    response = browser.open(url)
    handled.append(url)
    html = response.read()
    soup = BeautifulSoup(html, "html5lib")
    innerDiv = soup.find("div", id=content_div_id)
    if innerDiv is not None:
        # iterate over all relative links, scraping and rewriting each one
        links = innerDiv.find_all("a", href=re.compile(r"^/"))
        for link in links:
            if wiki_base_url in link['href']:
                # scrape the linked wiki page if we haven't seen it yet
                newoutput = link['href'].replace(wiki_base_url, "").replace("%20", "+").replace(".aspx", "")
                fullLink = sharepoint_url + link['href']
                if scrape_recursively and fullLink not in handled:
                    print fullLink
                    parse_page(fullLink, newoutput + ".md")
                # fix the link to point to its Confluence-esque equivalent:
                # remove aspx
                link['href'] = link['href'].replace(".aspx", "")
                if direct_confluence_entry:
                    # copying directly into a Confluence markdown entry needs
                    # different links to using the SOAP converter: replace the
                    # SharePoint url and use pluses rather than %20s
                    link['href'] = link['href'].replace(wiki_base_url, "/display/" + confluence_space_key + "/")
                    link['href'] = link['href'].replace("%20", "+")
                else:
                    # remove the SharePoint url and use spaces rather than %20s
                    link['href'] = link['href'].replace(wiki_base_url, "")
                    link['href'] = link['href'].replace("%20", " ")
            else:
                # make any remaining relative SharePoint links absolute
                link['href'] = sharepoint_url + link['href']
        # update relative image paths to point back to SharePoint
        images = innerDiv.find_all("img", src=re.compile(r"^/"))
        if len(images) > 0:
            has_images.append(url)
            # the loop doesn't need to be inside the if, but might as well
            for image in images:
                image['src'] = sharepoint_url + image['src']
        # remove the SharePoint layoutsData span content if it exists ("false,false,1" etc.)
        layoutsData = innerDiv.find("span", id="layoutsData")
        if layoutsData is not None:
            layoutsData.string = ""
        # convert to markdown (stripping zero-width spaces) and write it out
        content = str(innerDiv).decode('utf-8').replace(u"\u200b", "")
        h = html2text.HTML2Text()
        h.body_width = 0  # disable hard line wrapping
        markdown = h.handle(content)
        if add_legacy_link:
            markdown += "\n\n---\n\nThis page has been automatically translated from SharePoint.\n\nSomething not look right? Try the [legacy SharePoint link](" + url + ")"
        outfile = codecs.open("output/" + output, 'w', 'utf-8')
        outfile.write(markdown)
        outfile.close()
    else:
        # no content div: a non-empty response means the fetch failed,
        # an empty one means the page itself was empty
        if html != '':
            print 'Failed'
        else:
            print 'Empty'
            empties.append(url)
    return
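# A minimal config.yml sketch for reference. The key names are exactly the
# ones read below; every value is a hypothetical placeholder, not a real site:
#
#   sharepoint_url: "https://sharepoint.example.com"
#   wiki_base_url: "/sites/team/wiki/"
#   wiki_index: "Home.aspx"
#   username: "DOMAIN\\user"
#   password: "secret"
#   scrape_recursively: true
#   content_div_id: "ctl00_PlaceHolderMain_WikiField"
#   confluence_space_key: "WIKI"
#   direct_confluence_entry: false
#   add_legacy_link: true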
config = yaml.safe_load(open("config.yml"))
url = config['sharepoint_url'] + config['wiki_base_url'] + config['wiki_index']
# globals used by parse_page
user = config['username']
password = config['password']
scrape_recursively = config['scrape_recursively']
content_div_id = config['content_div_id']
sharepoint_url = config['sharepoint_url']
wiki_base_url = config['wiki_base_url']
wiki_index = config['wiki_index']
confluence_space_key = config['confluence_space_key']
direct_confluence_entry = config['direct_confluence_entry']
add_legacy_link = config['add_legacy_link']
handled = []
empties = []
has_images = []
# make sure the output directory exists
d = os.path.dirname('output/')
if not os.path.exists(d):
    os.makedirs(d)
# kick it off by parsing the index page
parse_page(url, 'index.md')
# write out the pages that came back empty
print "Empty pages"
print empties
with codecs.open('output/aaEmpty+Pages.md', 'w', 'utf-8') as empties_file:
    for item in empties:
        print >> empties_file, item
# write out the pages whose images still point back to SharePoint
print "Pages with images pointing back to SharePoint"
print has_images
with codecs.open('output/aaPages+with+Images.md', 'w', 'utf-8') as images_file:
    for item in has_images:
        print >> images_file, item