Skip to content

Commit f3ee083

Browse files
committed
Replaced the Node algorithm with a call to the Python script
1 parent 39fec49 commit f3ee083

File tree

4 files changed

+70
-100
lines changed

4 files changed

+70
-100
lines changed

python/scrapper.py

+43-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,44 @@
1-
import sys
2-
print "4Chan Scrapper"
1+
from bs4 import BeautifulSoup
2+
import requests, shutil, string
3+
import os, sys
34

4-
for arg in sys.argv:
5-
print arg
5+
def create_dirs(url):
6+
urlData = url.split('/');
7+
boardPath = './files/' + urlData[-4]
8+
threadPath = boardPath + '/' + urlData[-1];
9+
10+
if not os.path.exists('./files'):
11+
os.makedirs('./files')
12+
if not os.path.exists(boardPath):
13+
os.makedirs(boardPath)
14+
if not os.path.exists(threadPath):
15+
os.makedirs(threadPath)
16+
return threadPath
17+
18+
def download_resource(url, dirname):
19+
filename = url.split('/')[-1]
20+
filePath = dirname + '/' + filename
21+
if not os.path.exists(filePath):
22+
response = requests.get(url, stream = True)
23+
with open(filePath, 'wb') as out_file:
24+
shutil.copyfileobj(response.raw, out_file)
25+
del response
26+
#print('Downloading ' + url + '...')
27+
return 1
28+
else:
29+
return 0
30+
31+
32+
url = sys.argv[1] # We take the URL from the command line
33+
r = requests.get(url)
34+
data = r.text
35+
soup = BeautifulSoup(data, 'html.parser')
36+
dirname = create_dirs(url)
37+
resources = 0
38+
39+
links = soup.find_all('a', class_='fileThumb')
40+
print links.__len__(), ' Resources Going To Be Downloaded.'
41+
for link in links:
42+
resources = resources + download_resource('http:' + link.get('href'), dirname)
43+
44+
print resources, ' Resources Downloaded.'

routes.js

+5-5
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,24 @@ module.exports = function (app, fourChanService) {
2626
var surl = req.params.surl;
2727
fourChanService.getThread(section, id, surl, function (data) {
2828

29-
res.render('thread.jade', { threadName : surl, resources : data });
29+
res.render('thread.jade', { threadName : surl, pythonOutput : data });
3030
});
3131
});
3232

3333
app.get('/test', function (req, res) {
3434

3535
var spawn = require('child_process').spawn,
36-
ls = spawn('python', ['./python/scrapper.py', 'cadena', 1, 'otraCadena']);
36+
command = spawn('python', ['./python/scrapper.py', [1,2,3], 'http://algo.com']);
3737

38-
ls.stdout.on('data', function (data) {
38+
command.stdout.on('data', function (data) {
3939
console.log('stdout: ' + data);
4040
});
4141

42-
ls.stderr.on('data', function (data) {
42+
command.stderr.on('data', function (data) {
4343
console.log('stderr: ' + data);
4444
});
4545

46-
ls.on('close', function (code) {
46+
command.on('close', function (code) {
4747
console.log('child process exited with code ' + code);
4848
});
4949

services/fourChanService.js

+20-88
Original file line numberDiff line numberDiff line change
@@ -70,99 +70,31 @@ var fourChanService = {
7070

7171
getThread : function(boardName, id, surl, callback) {
7272

73-
var self = this; // Saving the reference to the main object
74-
75-
// Proccess to scrap
76-
var osmosis = require('osmosis');
73+
// URL of the thread to download
7774
var url = 'http://boards.4chan.org/' + boardName + '/thread/' + id + '/' + surl;
78-
osmosis.get(url)
79-
.set({
80-
'resources': ['a.fileThumb @href']
81-
})
82-
.data(function (results) {
75+
// Prepare to call the Python script
76+
var spawn = require('child_process').spawn;
77+
var command = spawn('python', ['./python/scrapper.py', url]);
78+
var output = '';
79+
80+
// Listen for the Python script's output
81+
command.stdout.on('data', function (data) {
82+
console.log(data);
83+
output += data;
84+
});
8385

84-
var resources = results.resources;
85-
console.log('There going to be downloaded ' + resources.length + ' resources.' );
86-
for(var cont = 0; cont < resources.length && cont < 50; cont++)
87-
{
88-
var url = 'http:' + resources[cont];
89-
90-
// We set the name for file
91-
var filename = resources[cont].split('/');
92-
filename = filename[filename.length - 1];
93-
94-
// We set the full path '.files/boardName/semanticURL/filename'
95-
var path = './files/' + boardName + '/' + surl + '/' +filename;
96-
97-
// We download the file
98-
var fileCounter = cont + 1;
99-
self.dowloadResource(url, boardName, surl, path, function () {
100-
console.log(fileCounter + '::File ' + path + ' downloaded');
101-
});
102-
}
103-
104-
setTimeout(function () { console.log('Second batch...') }, 2000);
105-
for(; cont < resources.length && cont < 100; cont++)
106-
{
107-
url = 'http:' + resources[cont];
108-
109-
// We set the name for file
110-
filename = resources[cont].split('/');
111-
filename = filename[filename.length - 1];
112-
113-
// We set the full path '.files/boardName/semanticURL/filename'
114-
path = './files/' + boardName + '/' + surl + '/' +filename;
115-
116-
// We download the file
117-
fileCounter = cont + 1;
118-
self.dowloadResource(url, boardName, surl, path, function () {
119-
console.log(fileCounter + '::File ' + path + ' downloaded');
120-
});
121-
}
122-
123-
setTimeout(function () { console.log('Third batch...') }, 2000);
124-
for(; cont < resources.length && cont < 150; cont++)
125-
{
126-
url = 'http:' + resources[cont];
127-
128-
// We set the name for file
129-
filename = resources[cont].split('/');
130-
filename = filename[filename.length - 1];
131-
132-
// We set the full path '.files/boardName/semanticURL/filename'
133-
path = './files/' + boardName + '/' + surl + '/' +filename;
134-
135-
// We download the file
136-
fileCounter = cont + 1;
137-
self.dowloadResource(url, boardName, surl, path, function () {
138-
console.log(fileCounter + '::File ' + path + ' downloaded');
139-
});
140-
}
141-
142-
setTimeout(function () { console.log('Forth batch...') }, 2000);
143-
for(; cont < resources.length && cont < 200; cont++)
144-
{
145-
url = 'http:' + resources[cont];
146-
147-
// We set the name for file
148-
filename = resources[cont].split('/');
149-
filename = filename[filename.length - 1];
150-
151-
// We set the full path '.files/boardName/semanticURL/filename'
152-
path = './files/' + boardName + '/' + surl + '/' +filename;
153-
154-
// We download the file
155-
fileCounter = cont + 1;
156-
self.dowloadResource(url, boardName, surl, path, function () {
157-
console.log(fileCounter + '::File ' + path + ' downloaded');
158-
});
159-
}
160-
// After all, we call the callback
161-
callback(resources);
86+
command.stderr.on('data', function (data) {
87+
console.log('stderr: ' + data);
88+
output += data;
89+
});
90+
91+
command.on('close', function (code) {
92+
console.log('Child process exited with code ' + code);
16293
});
16394

164-
16595

96+
// Finally, we call the callback
97+
callback(output);
16698
},
16799

168100
dowloadResource: function (uri, boardName, surl, filename, callback) {

views/thread.jade

+2-3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,5 @@ block content
44
div.row
55
div.col-md-12
66
h2 #{threadName}
7-
ul
8-
each resource in resources
9-
li #{resource}
7+
p #{pythonOutput}
8+

0 commit comments

Comments
 (0)