Commit
changed file structure and added dynamic input data
Changed the structure to be a bit more organized and updated all files to run from main.py. Also made it so that every time the grabber runs, it grabs the new Input Data.txt file from GitHub, eliminating the need to rebuild the entire Docker image every time I need to add a link. This also allows for easier pull requests against the input data file.
Wamy-Dev committed Apr 25, 2022
1 parent 13e98fb commit 41fe5c6
Showing 11 changed files with 104 additions and 152 deletions.
3 changes: 3 additions & 0 deletions components/.env
@@ -0,0 +1,3 @@
+SELENIUMCLIENT=yourseleniumclient
+SEARCHCLIENT=yourmeilisearchclient
+SEARCHAPIKEY=yourmeilisearchclientapikey
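
These placeholders are read at runtime via python-decouple, as the grabber and search scripts below show. A minimal sketch of how the values get loaded (the default fallback here is an illustrative addition, not something in the repo):

from decouple import config

# config() looks each key up in .env (or the process environment) and
# raises UndefinedValueError if the key is missing and no default is given.
SELENIUMCLIENT = config('SELENIUMCLIENT')
SEARCHCLIENT = config('SEARCHCLIENT')
# Hypothetical fallback for local runs without a real key.
SEARCHAPIKEY = config('SEARCHAPIKEY', default='dev-only-key')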
4 changes: 2 additions & 2 deletions cleaner.py → components/cleaner.py
@@ -1,7 +1,7 @@
 import json
 #loads json file
 try:
-    json_file = open("output.json")
+    json_file = open("./components/output.json")
     strings = json.load(json_file)
 except:
     print('Json file to be cleaned not found.')
@@ -21,7 +21,7 @@
 targets = {'marked':['https://masquerade.site#a-z-listing-1','https://nsw2u.xyz/#a-z-listing-2','https://madloader.com/request/','https://nxbrew.com/#a-z-listing-1','https://archive.org/download/mame-merged/mame-merged/../']}#add targets here; duplicate keys in a dict literal collapse to the last one, so keep all URLs in a single list per key. format: 'key':['target1','target2']
 for target_key,target_strings in targets.items():
     strings[target_key] = [s for s in strings[target_key] if s not in target_strings]
-with open("outputcleaned.json", "w", encoding="utf-8") as file:#dumps to new json file to be used in totable.py
+with open("./components/outputcleaned.json", "w", encoding="utf-8") as file:#dumps to new json file to be used in totable.py
     file.write(
         json.dumps(strings)
     )
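
Side note on the targets literal above: in Python, duplicate keys in a dict literal silently collapse to the last occurrence, so every URL to filter must live in one list per key. A quick self-contained demonstration:

collapsed = {'marked': ['a'], 'marked': ['b'], 'marked': ['c']}
assert collapsed == {'marked': ['c']}  # earlier entries are discarded

# Keeping one list per key filters every unwanted URL:
targets = {'marked': ['a', 'b']}
strings = {'marked': ['a', 'b', 'c']}
for key, unwanted in targets.items():
    strings[key] = [s for s in strings[key] if s not in unwanted]
assert strings == {'marked': ['c']}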
112 changes: 56 additions & 56 deletions forsearch.py → components/forsearch.py
@@ -1,57 +1,57 @@
-import json
-import random
-import string
-def listToString(s):
-    # initialize an empty string
-    str1 = ""
-    count=0
-    # traverse in the string
-    for ele in s:
-        str1 += ele
-        count+=1
-        if count==len(s):
-            continue
-        else:
-            str1+=" "
-    # return string
-    return str1
-input_file = 'outputcleaned.json' # input file
-output_file = 'outputsearchready.json'
-# Opening JSON file
-f = open(input_file)
-N=10 # ID length
-count= 0
-dic= {} # to store overal output
-data = json.load(f)
-for k in data.keys():
-    key = k
-    lst= []
-    count+=1
-    for sub_k in data[k]: # access each entry
-        ID= ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N)) # generating IDs of N length
-        j = sub_k.split("/")[-1]
-        if j=='':
-            j= sub_k.split("/")[-2]
-        name=None
-        if "-" in j:
-            name = j.split("-")
-        else:
-            name= j.split("_")
-        name = [nam.title() for nam in name ]
-        name = listToString(name)
-        # print(sub_k.replace("https://",""), j, name)
-        lst.append({"id":ID, "basename":name.replace("%",""),"link":sub_k.replace("https://","")})
-    dic[key] = lst
-    # comment next two lines, if you want output for all objects. This 'IF' is just for three objects to check output
-    #if count==3:
-    #    break
-with open(output_file, "w") as outfile:
-    json.dump(dic, outfile)
-json_file = open("outputsearchready.json")
-file = json.load(json_file)
-data = file.pop('marked')
-with open("outputsearchready.json", "w", encoding="utf-8") as file:#dumps to new json file to be used in totable.py
-    file.write(
-        json.dumps(data)
-    )
+import json
+import random
+import string
+def listToString(s):
+    # initialize an empty string
+    str1 = ""
+    count=0
+    # traverse in the string
+    for ele in s:
+        str1 += ele
+        count+=1
+        if count==len(s):
+            continue
+        else:
+            str1+=" "
+    # return string
+    return str1
+input_file = './components/outputcleaned.json' # input file
+output_file = './components/outputsearchready.json'
+# Opening JSON file
+f = open(input_file)
+N=10 # ID length
+count= 0
+dic= {} # to store overall output
+data = json.load(f)
+for k in data.keys():
+    key = k
+    lst= []
+    count+=1
+    for sub_k in data[k]: # access each entry
+        ID= ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N)) # generating IDs of N length
+        j = sub_k.split("/")[-1]
+        if j=='':
+            j= sub_k.split("/")[-2]
+        name=None
+        if "-" in j:
+            name = j.split("-")
+        else:
+            name= j.split("_")
+        name = [nam.title() for nam in name]
+        name = listToString(name)
+        # print(sub_k.replace("https://",""), j, name)
+        lst.append({"id":ID, "basename":name.replace("%",""),"link":sub_k.replace("https://","")})
+    dic[key] = lst
+    # uncomment the next two lines to stop after three objects when checking output
+    #if count==3:
+    #    break
+with open(output_file, "w") as outfile:
+    json.dump(dic, outfile)
+json_file = open("./components/outputsearchready.json")
+file = json.load(json_file)
+data = file.pop('marked')
+with open("./components/outputsearchready.json", "w", encoding="utf-8") as file:#dumps to new json file to be used in totable.py
+    file.write(
+        json.dumps(data)
+    )
+import sendtosearch
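
For readers checking the renaming logic, here is a hedged standalone sketch of the same slug-to-title transformation (the function name and sample URL are illustrative, not from the repo):

import random
import string

def basename_from_url(url, id_length=10):
    # Last non-empty path segment of the URL is the slug.
    j = url.split("/")[-1] or url.split("/")[-2]
    # Split on hyphens when present, otherwise underscores, then title-case.
    parts = j.split("-") if "-" in j else j.split("_")
    name = " ".join(p.title() for p in parts)  # equivalent to listToString above
    uid = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(id_length))
    return {"id": uid, "basename": name.replace("%", ""), "link": url.replace("https://", "")}

print(basename_from_url("https://example.com/some-game-title/"))
# -> {'id': '...', 'basename': 'Some Game Title', 'link': 'example.com/some-game-title/'}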
22 changes: 16 additions & 6 deletions grabber.py → components/grabber.py
@@ -12,20 +12,30 @@
 import time
 import decouple
 from decouple import config
 
+import requests
+from os import getcwd
 #
 SELENIUMCLIENT = config('SELENIUMCLIENT')
 #starting timer
 print('starting process')
+#getting updated input file
+url = "https://raw.githubusercontent.com/Wamy-Dev/ReziWebsite/main/Input%20Data.txt"
+directory = getcwd()
+r = requests.get(url)
+data = open("./components/Input Data.txt", "wb")
+data.write(r.content)
+data.close()
 #setting up chrome settings
 uc = webdriver
 chrome_options = webdriver.ChromeOptions()
 #chrome_options.add_argument('--headless') #remove hashtag at the start to run in headless mode, must also remove extension for this to work, not recommended
-chrome_options.add_extension('extension_1_38_0_0.crx')
+chrome_options.add_extension('./resources/ublockorigin.crx')
+chrome_options.add_extension('./resources/popupblockerpro.crx')
 chrome_options.add_argument('--no-sandbox')
 chrome_options.add_argument('--disable-dev-shm-usage')
 chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36")
-#wd = uc.Chrome(executable_path='chromedriver',options=chrome_options) #if local
-wd = uc.Remote(SELENIUMCLIENT, options=chrome_options) #if for remote
+wd = uc.Chrome(executable_path='./resources/chromedriver',options=chrome_options) #if local
+#wd = uc.Remote(SELENIUMCLIENT, options=chrome_options) #if for remote
 json_data={}
 #getting the links and setting up json
 def link_container(site_name,container_tag,class_tag,html,domain):
@@ -68,7 +78,7 @@ def return_next_ele(html,check_element,next_page):
     # next=wd.find_element(By.XPATH,'//a[@class="lcp_nextlink"]')
     return next
 #getting data from input file
-input_file=open('Input Data.txt','r')
+input_file=open('./components/Input Data.txt','r')
 name=input_file.readline().replace("\n","")
 json_data[name]=[]
 while (True):
@@ -104,7 +114,7 @@ def return_next_ele(html,check_element,next_page):
     if not name: break
 input_file.close()
 #outputting the data to Output.json
-output_file=open("output.json","w")
+output_file=open("./components/output.json","w")
 json_string=json.dumps(json_data)
 output_file.write(json_string)
 output_file.close()
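
One caveat on the new download step: requests.get has no failure handling here, so an outage would save an error page (or raise) mid-run. A hedged variant of the same fetch, with the same URL and output path as the diff; the timeout, status check, and fallback are suggested additions, not part of the commit:

import requests

url = "https://raw.githubusercontent.com/Wamy-Dev/ReziWebsite/main/Input%20Data.txt"
try:
    r = requests.get(url, timeout=30)
    r.raise_for_status()  # fail loudly on 404/5xx instead of saving an error page
    with open("./components/Input Data.txt", "wb") as data:
        data.write(r.content)
except requests.RequestException as e:
    # Fall back to whatever copy is already in the checkout/image.
    print(f"Could not refresh input data, keeping the local copy: {e}")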
16 changes: 16 additions & 0 deletions components/sendtosearch.py
@@ -0,0 +1,16 @@
+import meilisearch
+import json
+import os
+import decouple
+from decouple import config
+#
+SEARCHCLIENT = config('SEARCHCLIENT')
+SEARCHAPIKEY = config('SEARCHAPIKEY')
+#
+#client = meilisearch.Client('serverlocation', 'apikey')
+client = meilisearch.Client(SEARCHCLIENT, SEARCHAPIKEY)
+json_file = open('./components/outputsearchready.json')
+games = json.load(json_file)
+client.delete_index('games') #deletes the previous index first; Meilisearch adds documents on top of an existing index and updating in place doesn't work very well, so a plain delete-and-recreate works fine
+client.index('games').add_documents(games)
+print('finished entire process.')
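
If index settings ever need to survive a refresh, clearing the documents instead of dropping the whole index is an alternative to the delete-and-recreate approach above. A sketch under that assumption (delete_all_documents exists in the Meilisearch Python client, though behavior can vary across versions):

import json
import meilisearch
from decouple import config

client = meilisearch.Client(config('SEARCHCLIENT'), config('SEARCHAPIKEY'))
with open('./components/outputsearchready.json') as f:
    games = json.load(f)
index = client.index('games')
index.delete_all_documents()  # clears documents but keeps index settings
index.add_documents(games)    # re-adds the freshly scraped entries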
Binary file removed extension_1_38_0_0.crx
10 changes: 10 additions & 0 deletions main.py
@@ -0,0 +1,10 @@
+#Welcome to Rezi!
+#    ____             _
+#   / __ \___  ____  (_)
+#  / /_/ / _ \/_  / / /
+# / _, _/  __/ / /_/ /
+#/_/ |_|\___/ /___/_/
+#Rezi was written in Python 3.9.6 on Selenium.
+import sys
+sys.path.append('./components')
+import grabber
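
main.py works because each stage does its work at module import time: importing grabber kicks off the scrape, and a stage's trailing import hands off to the next one (as forsearch.py chains to sendtosearch above; grabber presumably chains onward the same way). A tiny sketch of the pattern with a hypothetical module name:

# components/stage.py -- top-level statements run the moment the module is imported
print("stage running")

# main.py
import sys
sys.path.append('./components')  # make components/ importable
import stage                     # importing executes the stage immediately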
7 changes: 1 addition & 6 deletions requirements.txt
@@ -1,11 +1,6 @@
 beautifulsoup4 == 4.10.0
 selenium == 4.1.0
-undetected-chromedriver == 3.0.6
-requests-html == 0.10.0
-output == 1.0.1
-json2table
-pysftp
 requests
-output
 meilisearch
+pysftp
 python-decouple
Binary file added resources/popupblockerpro.crx
Binary file added resources/ublockorigin.crx
82 changes: 0 additions & 82 deletions sendtosearch.py

This file was deleted.
