-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinkfetcher2.py
144 lines (116 loc) · 6.22 KB
/
linkfetcher2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import xml.etree.ElementTree as ET
import requests
import re
from urllib.parse import quote
from fuzzywuzzy import fuzz, process
from urllib.parse import unquote
def fetch_and_parse_xml(url, timeout=30):
    """Download ``url`` and parse the response body as XML.

    Args:
        url: Address of the XML document to fetch.
        timeout: Seconds to wait on the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The root ``Element`` of the parsed document, or ``None`` when the
        request fails or the body is not well-formed XML. In both failure
        cases a message is printed.
    """
    try:
        # An explicit timeout keeps a stalled server from hanging the script.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return ET.fromstring(response.content)
    except requests.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
    except ET.ParseError as e:
        # A malformed body is reported like any other failed fetch instead
        # of aborting the whole run with a traceback.
        print(f"Error parsing XML from {url}: {e}")
    return None
def preprocess_for_fuzzy(text):
    """Normalise ``text`` for fuzzy comparison.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, and leading/trailing whitespace
    is trimmed.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.strip()
def is_excluded(file_name):
    """Return True if any parenthesised part of ``file_name`` contains an excluded term.

    Only text inside ``(...)`` groups is inspected; the comparison is
    case-insensitive and matches the term anywhere inside the group.
    """
    excluded_terms = ('proto', 'beta', 'demo', 'sample', 'promo')
    bracketed_parts = re.findall(r'\((.*?)\)', file_name, re.IGNORECASE)
    return any(
        term in part.lower()
        for part in bracketed_parts
        for term in excluded_terms
    )
def extract_numbers(text):
    """Return every run of consecutive digits in ``text``, in order, as strings."""
    return [found.group(0) for found in re.finditer(r'\d+', text)]
def select_most_similar_file(game_title, files, min_score=70):
    """Pick the file whose name best matches ``game_title``.

    A file is only considered when it is not excluded by ``is_excluded``
    and the digit runs in its base name (the text before the first "(")
    are exactly the digit runs found in ``game_title``. Candidates are
    ranked with ``fuzz.token_sort_ratio`` on the preprocessed strings; on
    a tie the earliest file in ``files`` wins.

    Args:
        game_title: Title to look up.
        files: Iterable of candidate file names.
        min_score: Lowest fuzzy score that still counts as a match.

    Returns:
        ``(file_name, score)`` for the best candidate when its score is at
        least ``min_score``, otherwise ``(None, None)``. The outcome is
        also printed.
    """
    preprocessed_title = preprocess_for_fuzzy(game_title)
    title_numbers = extract_numbers(game_title)

    best_match = None
    best_score = -1  # Below any real score, so the first candidate always wins
    for file in files:
        if is_excluded(file):
            continue
        # Only the text before the first "(" takes part in the comparison.
        file_base = file.split("(")[0].strip()
        # Differing numbers mean a different entry in a series, however
        # similar the remaining words are.
        if extract_numbers(file_base) != title_numbers:
            continue
        score = fuzz.token_sort_ratio(
            preprocessed_title, preprocess_for_fuzzy(file_base)
        )
        if score > best_score:
            best_match, best_score = file, score

    if best_match and best_score >= min_score:
        print(f"Matched '{game_title}' to '{best_match}' with a score of {best_score}")
        return best_match, best_score
    print(f"No match found for '{game_title}'")
    return None, None
def generate_download_links(game_titles, xml_data_urls, input_filename):
    """Match game titles to listed files and write link and log reports.

    Every URL in ``xml_data_urls`` is fetched with ``fetch_and_parse_xml``
    and the ``name`` attribute of each ``<file>`` child of the root is
    collected. Each title is then matched with ``select_most_similar_file``
    and, on success, turned into
    ``https://archive.org/download/<identifier>/<quoted file name>``.

    Two UTF-8 text files are written next to ``input_filename`` (its part
    before the last "." is used as the stem):

    * ``<stem>_log.txt``: matched titles with their file names, followed
      by the unmatched titles.
    * ``<stem>_links.txt``: matched titles with their download links.

    Args:
        game_titles: Iterable of titles to look up.
        xml_data_urls: Iterable of file-listing XML URLs; the path segment
            after ``/items/`` is used as the item identifier.
        input_filename: Name of the title list, used to derive the report
            file names.

    Returns:
        ``(unmatched_titles, sorted_matched_games)``: the unmatched titles
        in input order, and a dict mapping each matched title to its
        download link, ordered alphabetically by title.
    """
    all_files = []
    file_sources = {}  # file name -> item identifier of the listing it came from
    for url in xml_data_urls:
        root = fetch_and_parse_xml(url)
        # Compare with None explicitly: truth-testing an Element is
        # deprecated and an Element without children is falsy.
        if root is None:
            continue
        # The identifier is the same for every file in this listing.
        source_identifier = url.split("/items/")[-1].split('/')[0]
        for entry in root.findall('file'):
            file_name = entry.get('name')
            if file_name is None:
                # An entry without a name can be neither matched nor linked.
                continue
            all_files.append(file_name)
            # A name present in several listings maps to the last one seen.
            file_sources[file_name] = source_identifier

    all_files_sorted = sorted(
        (f for f in all_files if not is_excluded(f)), key=str.lower
    )

    matched_games = {}  # game title -> download link
    unmatched_titles = []
    for game_title in game_titles:
        selected_file, _score = select_most_similar_file(game_title, all_files_sorted)
        if selected_file:
            source_identifier = file_sources[selected_file]
            matched_games[game_title] = (
                f"https://archive.org/download/{source_identifier}/{quote(selected_file)}"
            )
        else:
            unmatched_titles.append(game_title)

    # Both reports list matches alphabetically by game title.
    sorted_matched_games = {k: matched_games[k] for k in sorted(matched_games)}
    sorted_unmatched = sorted(unmatched_titles)
    stem = input_filename.rsplit('.', 1)[0]

    # Explicit UTF-8 so titles or file names with non-ASCII characters can
    # always be written, whatever the platform's default encoding is.
    log_filename = f"{stem}_log.txt"
    with open(log_filename, "w", encoding="utf-8") as log_file:
        log_file.write("Matched Titles:\n")
        for title, download_link in sorted_matched_games.items():
            # Recover the readable file name from the last URL segment.
            filename = unquote(download_link.split('/')[-1])
            log_file.write(f"{title}\n{filename}\n\n")
        if unmatched_titles:
            log_file.write("\nUnmatched Titles:\n")
            for title in sorted_unmatched:
                log_file.write(f"{title}\n\n")

    links_filename = f"{stem}_links.txt"
    with open(links_filename, "w", encoding="utf-8") as links_file:
        for title, download_link in sorted_matched_games.items():
            links_file.write(f"Title: {title}\nDownload Link: {download_link}\n\n")

    print("\nUnmatched Titles:")
    for title in sorted_unmatched:
        print(title)

    return unmatched_titles, sorted_matched_games
if __name__ == "__main__":
    input_filename = "greatest_ps2.txt"
    # One game title per line. Lines starting with '#' are comments, and
    # blank lines are skipped so they are not looked up as empty titles.
    with open(input_filename, "r") as file:
        stripped_lines = (line.strip() for line in file)
        game_titles = [
            title for title in stripped_lines
            if title and not title.startswith('#')
        ]
    # File listings (``*_files.xml``) of the archive.org items to search.
    xml_data_urls = [
        "https://ia800505.us.archive.org/7/items/redumpSonyPlaystation2UsaGames2018Aug01/redumpSonyPlaystation2UsaGames2018Aug01_files.xml",
        "https://ia904703.us.archive.org/32/items/redumpSonyPlaystation2UsaGames2018Aug01Part2/redumpSonyPlaystation2UsaGames2018Aug01Part2_files.xml",
        "https://ia801005.us.archive.org/20/items/redumpSonyPlaystation2UsaGames2018Aug01Part3/redumpSonyPlaystation2UsaGames2018Aug01Part3_files.xml",
        "https://ia803004.us.archive.org/19/items/redumpSonyPlaystation2UsaGames2018Aug01Part4/redumpSonyPlaystation2UsaGames2018Aug01Part4_files.xml",
        "https://ia803009.us.archive.org/4/items/redumpSonyPlaystation2UsaOther2018Aug01/redumpSonyPlaystation2UsaOther2018Aug01_files.xml",
    ]
    unmatched_titles, matched_games = generate_download_links(game_titles, xml_data_urls, input_filename)