Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coomer update #2215

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 72 additions & 17 deletions scrapers/Coomer/Coomer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
'Referer': 'https://coomer.su/search_hash'
}

def extract_mentions_and_tags(text):
    """Pull @mentions and #hashtags out of a post's text.

    Args:
        text: Raw post content; may be an empty string or None.

    Returns:
        A ``(mentions, hashtags)`` tuple of lists of strings, each without
        its leading ``@`` / ``#``, in order of appearance. Duplicates are
        kept; callers that need uniqueness should de-duplicate themselves.
    """
    if not text:
        return [], []

    # Handles may contain '.', '-' and '_' internally, but a trailing '.' is
    # sentence punctuation ("thanks @user."), not part of the handle. Strip it
    # and drop anything that was punctuation only (e.g. "@...").
    candidates = re.findall(r'@([\w\-.]+)', text)
    mentions = [
        handle
        for handle in (candidate.rstrip('.') for candidate in candidates)
        if handle
    ]
    hashtags = re.findall(r'#(\w+)', text)
    return mentions, hashtags

def debugPrint(t):
    """Write the string ``t`` to standard error, followed by a newline."""
    line = t + "\n"
    sys.stderr.write(line)

Expand All @@ -23,7 +28,7 @@ def readJSONInput():
input = sys.stdin.read()
return json.loads(input)

def clean_text(details: str) -> str:
def clean_text(details: str) -> (str, str):
"""
remove escaped backslashes and html parse the details text
"""
Expand All @@ -33,15 +38,54 @@ def clean_text(details: str) -> str:
details) # bs.get_text doesnt replace br's with \n
details = re.sub(r'</?p>', '\n', details)
details = bs(details, features='html.parser').get_text()
# Remove leading/trailing/double whitespaces
details = '\n'.join(
[
' '.join([s for s in x.strip(' ').split(' ') if s != ''])
for x in ''.join(details).split('\n')
]
)
details = details.strip()
return details
lines = details.split('\n')
first_line = lines[0] if lines else ""

if len(first_line) > 100:
# Consider only the first 100 characters for truncation
first_100_chars = first_line[:100]
# Regular expression to match common emoji patterns
emoji_pattern = re.compile(
"["
"\U0001F600-\U0001F64F" # emoticons
"\U0001F300-\U0001F5FF" # symbols & pictographs
"\U0001F680-\U0001F6FF" # transport & map symbols
"\U0001F1E0-\U0001F1FF" # flags (iOS)
"\U00002702-\U000027B0" # Dingbats
"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE
)
match = emoji_pattern.search(first_100_chars)
if match:
truncated_first_line = first_100_chars[:match.start()]
else:
dot_index = first_100_chars.find('.')
if dot_index != -1:
truncated_first_line = first_100_chars[:dot_index + 1]
else:
exclam_index = first_100_chars.find('!')
if exclam_index != -1:
truncated_first_line = first_100_chars[:exclam_index + 1]
else:
truncated_first_line = first_100_chars

rest_of_details = '\n'.join(lines[1:]).strip() if len(lines) > 1 else ""
rest_of_details = truncated_first_line + '\n' + rest_of_details
first_line = first_100_chars
else:
rest_of_details = '\n'.join(lines[1:]).strip() if len(lines) > 1 else ""

first_line = first_line.title()

return first_line, rest_of_details
return "", ""

def user_query (service, user):
if re.match('[0-9]*', user):
Expand All @@ -63,34 +107,45 @@ def post_query(service, user_id, id):
log.debug(data)
post = data['post']
user_name = user_query(service, user_id)
studio = {"Name": user_name}

if service == "onlyfans":
studio["URL"] = f"https://onlyfans.com/{user_name}"
studio = {"Name": f"{user_name} (OnlyFans)", "URL": f"https://onlyfans.com/{user_name}"}
elif service == "fansly":
studio["URL"] = f"https://fansly.com/{user_name}"
studio = {"Name": f"{user_name} (Fansly)", "URL": f"https://fansly.com/{user_name}"}
elif service == "candfans":
studio["URL"] = f"https://candfans.com/{user_name}"
studio = {"Name": f"{user_name} (CandFans)", "URL": f"https://candfans.com/{user_name}"}
else:
studio = {"Name": user_name}
debugPrint("No service listed")

tags = []
mentions, hashtags = extract_mentions_and_tags(post.get('content', ''))

unique_performers = {user_name}
unique_performers.update(mentions)

if post['tags'] is not None:
tags = [{"name": item } for item in post['tags']]

out = {"Title": post['title'],
"Date": post['published'][:10],
"URL": f"https://coomer.su/{post['service']}/user/{post['user']}/post/{post['id']}",
"Details": clean_text(post['content']),
"Studio": studio,
"Performers": [{"Name": user_name, "urls": [studio['URL']]}],
"Tags": tags
tags = [{"name": item} for item in post['tags']]
else:
tags = [{"name": tag} for tag in hashtags]

first_line, rest_of_details = clean_text(post['content'])

out = {
"Title": first_line,
"Date": post['published'].split('T')[0],
"URL": f"https://coomer.su/{post['service']}/user/{post['user']}/post/{post['id']}",
"Details": rest_of_details,
"Studio": studio,
"Performers": [{"Name": name, "urls": [studio['URL']]} for name in unique_performers],
"Tags": tags,
}

log.debug(out)
return out
else:
debugPrint(f'Response: {str(post_lookup_response.status_code)} \n Text: {str(post_lookup_response.text)}')


def get_scene(inputurl):
match = re.search(r'/(\w+?)/user/(.+?)/post/(\d+)', inputurl)
if match:
Expand Down