Commit
changed file structure and added dynamic input data
Changed the structure to be a bit more organized and updated all files to run from main.py. Also made it so that every time the grabber runs, it grabs the new Input Data.txt file from GitHub, eliminating the need to rebuild the entire Docker image every time I need to add a link. This also allows for easier pull requests against the input data file.
Wamy-Dev committed Apr 25, 2022
1 parent 13e98fb commit 41fe5c6
Showing 11 changed files with 104 additions and 152 deletions.
3 changes: 3 additions & 0 deletions components/.env
@@ -0,0 +1,3 @@
+SELENIUMCLIENT=yourseleniumclient
+SEARCHCLIENT=yourmeilisearchclient
+SEARCHAPIKEY=yourmeilisearchclientapikey
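
These placeholders are read at runtime via python-decouple, as the grabber and search scripts below show. A minimal sketch of how the values get loaded (the default fallback here is an illustrative addition, not something in the repo):

from decouple import config

# config() looks each key up in .env (or the process environment) and
# raises UndefinedValueError if the key is missing and no default is given.
SELENIUMCLIENT = config('SELENIUMCLIENT')
SEARCHCLIENT = config('SEARCHCLIENT')
# Hypothetical fallback for local runs without a real key.
SEARCHAPIKEY = config('SEARCHAPIKEY', default='dev-only-key')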
4 changes: 2 additions & 2 deletions cleaner.py → components/cleaner.py
@@ -1,7 +1,7 @@
 import json
 #loads json file
 try:
-    json_file = open("output.json")
+    json_file = open("./components/output.json")
     strings = json.load(json_file)
 except:
     print('Json file to be cleaned not found.')
@@ -21,7 +21,7 @@
 targets = {'marked':['https://masquerade.site#a-z-listing-1','https://nsw2u.xyz/#a-z-listing-2','https://madloader.com/request/','https://nxbrew.com/#a-z-listing-1','https://archive.org/download/mame-merged/mame-merged/../']}#add targets here; duplicate keys in a dict literal collapse to the last one, so keep all URLs in a single list per key. format: 'key':['target1','target2']
 for target_key,target_strings in targets.items():
     strings[target_key] = [s for s in strings[target_key] if s not in target_strings]
-with open("outputcleaned.json", "w", encoding="utf-8") as file:#dumps to new json file to be used in totable.py
+with open("./components/outputcleaned.json", "w", encoding="utf-8") as file:#dumps to new json file to be used in totable.py
     file.write(
         json.dumps(strings)
     )
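
Side note on the targets literal above: in Python, duplicate keys in a dict literal silently collapse to the last occurrence, so every URL to filter must live in one list per key. A quick self-contained demonstration:

collapsed = {'marked': ['a'], 'marked': ['b'], 'marked': ['c']}
assert collapsed == {'marked': ['c']}  # earlier entries are discarded

# Keeping one list per key filters every unwanted URL:
targets = {'marked': ['a', 'b']}
strings = {'marked': ['a', 'b', 'c']}
for key, unwanted in targets.items():
    strings[key] = [s for s in strings[key] if s not in unwanted]
assert strings == {'marked': ['c']}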
112 changes: 56 additions & 56 deletions forsearch.py → components/forsearch.py
@@ -1,57 +1,57 @@
-import json
-import random
-import string
-def listToString(s):
-    # initialize an empty string
-    str1 = ""
-    count=0
-    # traverse in the string
-    for ele in s:
-        str1 += ele
-        count+=1
-        if count==len(s):
-            continue
-        else:
-            str1+=" "
-    # return string
-    return str1
-input_file = 'outputcleaned.json' # input file
-output_file = 'outputsearchready.json'
-# Opening JSON file
-f = open(input_file)
-N=10 # ID length
-count= 0
-dic= {} # to store overal output
-data = json.load(f)
-for k in data.keys():
-    key = k
-    lst= []
-    count+=1
-    for sub_k in data[k]: # access each entry
-        ID= ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N)) # generating IDs of N length
-        j = sub_k.split("/")[-1]
-        if j=='':
-            j= sub_k.split("/")[-2]
-        name=None
-        if "-" in j:
-            name = j.split("-")
-        else:
-            name= j.split("_")
-        name = [nam.title() for nam in name ]
-        name = listToString(name)
-        # print(sub_k.replace("https://",""), j, name)
-        lst.append({"id":ID, "basename":name.replace("%",""),"link":sub_k.replace("https://","")})
-    dic[key] = lst
-    # comment next two lines, if you want output for all objects. This 'IF' is just for three objects to check output
-    #if count==3:
-    #    break
-with open(output_file, "w") as outfile:
-    json.dump(dic, outfile)
-json_file = open("outputsearchready.json")
-file = json.load(json_file)
-data = file.pop('marked')
-with open("outputsearchready.json", "w", encoding="utf-8") as file:#dumps to new json file to be used in totable.py
-    file.write(
-        json.dumps(data)
-    )
+import json
+import random
+import string
+def listToString(s):
+    # initialize an empty string
+    str1 = ""
+    count=0
+    # traverse in the string
+    for ele in s:
+        str1 += ele
+        count+=1
+        if count==len(s):
+            continue
+        else:
+            str1+=" "
+    # return string
+    return str1
+input_file = './components/outputcleaned.json' # input file
+output_file = './components/outputsearchready.json'
+# Opening JSON file
+f = open(input_file)
+N=10 # ID length
+count= 0
+dic= {} # to store overall output
+data = json.load(f)
+for k in data.keys():
+    key = k
+    lst= []
+    count+=1
+    for sub_k in data[k]: # access each entry
+        ID= ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N)) # generating IDs of N length
+        j = sub_k.split("/")[-1]
+        if j=='':
+            j= sub_k.split("/")[-2]
+        name=None
+        if "-" in j:
+            name = j.split("-")
+        else:
+            name= j.split("_")
+        name = [nam.title() for nam in name]
+        name = listToString(name)
+        # print(sub_k.replace("https://",""), j, name)
+        lst.append({"id":ID, "basename":name.replace("%",""),"link":sub_k.replace("https://","")})
+    dic[key] = lst
+    # uncomment the next two lines to stop after three objects when checking output
+    #if count==3:
+    #    break
+with open(output_file, "w") as outfile:
+    json.dump(dic, outfile)
+json_file = open("./components/outputsearchready.json")
+file = json.load(json_file)
+data = file.pop('marked')
+with open("./components/outputsearchready.json", "w", encoding="utf-8") as file:#dumps to new json file to be used in totable.py
+    file.write(
+        json.dumps(data)
+    )
+import sendtosearch
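
For readers checking the renaming logic, here is a hedged standalone sketch of the same slug-to-title transformation (the function name and sample URL are illustrative, not from the repo):

import random
import string

def basename_from_url(url, id_length=10):
    # Last non-empty path segment of the URL is the slug.
    j = url.split("/")[-1] or url.split("/")[-2]
    # Split on hyphens when present, otherwise underscores, then title-case.
    parts = j.split("-") if "-" in j else j.split("_")
    name = " ".join(p.title() for p in parts)  # equivalent to listToString above
    uid = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(id_length))
    return {"id": uid, "basename": name.replace("%", ""), "link": url.replace("https://", "")}

print(basename_from_url("https://example.com/some-game-title/"))
# -> {'id': '...', 'basename': 'Some Game Title', 'link': 'example.com/some-game-title/'}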
22 changes: 16 additions & 6 deletions grabber.py → components/grabber.py
@@ -12,20 +12,30 @@
 import time
 import decouple
 from decouple import config
 
+import requests
+from os import getcwd
 #
 SELENIUMCLIENT = config('SELENIUMCLIENT')
 #starting timer
 print('starting process')
+#getting updated input file
+url = "https://raw.githubusercontent.com/Wamy-Dev/ReziWebsite/main/Input%20Data.txt"
+directory = getcwd()
+r = requests.get(url)
+data = open("./components/Input Data.txt", "wb")
+data.write(r.content)
+data.close()
 #setting up chrome settings
 uc = webdriver
 chrome_options = webdriver.ChromeOptions()
 #chrome_options.add_argument('--headless') #remove hashtag at the start to run in headless mode, must also remove extension for this to work, not recommended
-chrome_options.add_extension('extension_1_38_0_0.crx')
+chrome_options.add_extension('./resources/ublockorigin.crx')
+chrome_options.add_extension('./resources/popupblockerpro.crx')
 chrome_options.add_argument('--no-sandbox')
 chrome_options.add_argument('--disable-dev-shm-usage')
 chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36")
-#wd = uc.Chrome(executable_path='chromedriver',options=chrome_options) #if local
-wd = uc.Remote(SELENIUMCLIENT, options=chrome_options) #if for remote
+wd = uc.Chrome(executable_path='./resources/chromedriver',options=chrome_options) #if local
+#wd = uc.Remote(SELENIUMCLIENT, options=chrome_options) #if for remote
 json_data={}
 #getting the links and setting up json
 def link_container(site_name,container_tag,class_tag,html,domain):
@@ -68,7 +78,7 @@ def return_next_ele(html,check_element,next_page):
     # next=wd.find_element(By.XPATH,'//a[@class="lcp_nextlink"]')
     return next
 #getting data from input file
-input_file=open('Input Data.txt','r')
+input_file=open('./components/Input Data.txt','r')
 name=input_file.readline().replace("\n","")
 json_data[name]=[]
 while (True):
@@ -104,7 +114,7 @@ def return_next_ele(html,check_element,next_page):
     if not name: break
 input_file.close()
 #outputting the data to Output.json
-output_file=open("output.json","w")
+output_file=open("./components/output.json","w")
 json_string=json.dumps(json_data)
 output_file.write(json_string)
 output_file.close()
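
One caveat on the new download step: requests.get has no failure handling here, so an outage would save an error page (or raise) mid-run. A hedged variant of the same fetch, with the same URL and output path as the diff; the timeout, status check, and fallback are suggested additions, not part of the commit:

import requests

url = "https://raw.githubusercontent.com/Wamy-Dev/ReziWebsite/main/Input%20Data.txt"
try:
    r = requests.get(url, timeout=30)
    r.raise_for_status()  # fail loudly on 404/5xx instead of saving an error page
    with open("./components/Input Data.txt", "wb") as data:
        data.write(r.content)
except requests.RequestException as e:
    # Fall back to whatever copy is already in the checkout/image.
    print(f"Could not refresh input data, keeping the local copy: {e}")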
16 changes: 16 additions & 0 deletions components/sendtosearch.py
@@ -0,0 +1,16 @@
+import meilisearch
+import json
+import os
+import decouple
+from decouple import config
+#
+SEARCHCLIENT = config('SEARCHCLIENT')
+SEARCHAPIKEY = config('SEARCHAPIKEY')
+#
+#client = meilisearch.Client('serverlocation', 'apikey')
+client = meilisearch.Client(SEARCHCLIENT, SEARCHAPIKEY)
+json_file = open('./components/outputsearchready.json')
+games = json.load(json_file)
+client.delete_index('games') #deletes the previous index first; Meilisearch adds documents on top of an existing index and updating in place doesn't work very well, so a plain delete-and-recreate works fine
+client.index('games').add_documents(games)
+print('finished entire process.')
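
If index settings ever need to survive a refresh, clearing the documents instead of dropping the whole index is an alternative to the delete-and-recreate approach above. A sketch under that assumption (delete_all_documents exists in the Meilisearch Python client, though behavior can vary across versions):

import json
import meilisearch
from decouple import config

client = meilisearch.Client(config('SEARCHCLIENT'), config('SEARCHAPIKEY'))
with open('./components/outputsearchready.json') as f:
    games = json.load(f)
index = client.index('games')
index.delete_all_documents()  # clears documents but keeps index settings
index.add_documents(games)    # re-adds the freshly scraped entries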
Binary file removed extension_1_38_0_0.crx
10 changes: 10 additions & 0 deletions main.py
@@ -0,0 +1,10 @@
+#Welcome to Rezi!
+#    ____             _
+#   / __ \___  ____  (_)
+#  / /_/ / _ \/_  / / /
+# / _, _/  __/ / /_/ /
+#/_/ |_|\___/ /___/_/
+#Rezi was written in Python 3.9.6 on Selenium.
+import sys
+sys.path.append('./components')
+import grabber
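
main.py works because each stage does its work at module import time: importing grabber kicks off the scrape, and a stage's trailing import hands off to the next one (as forsearch.py chains to sendtosearch above; grabber presumably chains onward the same way). A tiny sketch of the pattern with a hypothetical module name:

# components/stage.py -- top-level statements run the moment the module is imported
print("stage running")

# main.py
import sys
sys.path.append('./components')  # make components/ importable
import stage                     # importing executes the stage immediately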
7 changes: 1 addition & 6 deletions requirements.txt
@@ -1,11 +1,6 @@
 beautifulsoup4 == 4.10.0
 selenium == 4.1.0
-undetected-chromedriver == 3.0.6
-requests-html == 0.10.0
-output == 1.0.1
-json2table
-pysftp
 requests
-output
 meilisearch
+pysftp
 python-decouple
Binary file added resources/popupblockerpro.crx
Binary file added resources/ublockorigin.crx
82 changes: 0 additions & 82 deletions sendtosearch.py

This file was deleted.
