Skip to content

Commit

Permalink
Add Got2Pee scene scraper (#2183)
Browse files Browse the repository at this point in the history
  • Loading branch information
Zheltyy authored Jan 30, 2025
1 parent 3808b13 commit 2fe6c7e
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 0 deletions.
17 changes: 17 additions & 0 deletions scrapers/Got2Pee/Got2Pee.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Scraper definition for Got2Pee scene pages.
name: Got2Pee

# Scene-by-URL scraping is delegated to an external script.
sceneByURL:
- action: script
# URL fragment(s) this scraper is registered for (presumably matched against
# the scene URL supplied by the user; verify against the host application).
url:
- got2pee.com/videos/
# Command line to run: `python got2pee_scraper.py scrape`.
# got2pee_scraper.py reads a JSON object from stdin, looks up its "url" key,
# and prints the scraped scene as JSON on stdout.
# NOTE(review): the script never inspects its command-line arguments, so the
# trailing "scrape" argument appears to be unused; confirm before removing.
script:
- python
- got2pee_scraper.py
- scrape

# NOTE(review): these look like settings for a built-in page loader; it is
# unclear whether they have any effect on a `script` action. TODO confirm.
driver:
useCDP: false
sleep: 2

# NOTE(review): presumably only relevant to built-in HTML scraping; confirm.
debug:
printHTML: false
85 changes: 85 additions & 0 deletions scrapers/Got2Pee/got2pee_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import requests
from lxml import html
import json
from datetime import datetime

def get_page_content(url):
    """Fetch *url* and return it parsed as an lxml HTML tree.

    The request is sent with a desktop-browser User-Agent and a 10-second
    timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The parsed document, or ``None`` when the request fails (connection
        error, timeout, non-success status reported by ``raise_for_status``)
        or when the response body is empty.
    """
    # Local import: this file only imports ``sys`` inside its __main__ guard.
    import sys

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # stdout carries the JSON result, so diagnostics must go to stderr.
        print(
            "Got2Pee scraper: request for {} failed: {}".format(url, err),
            file=sys.stderr,
        )
        return None

    # lxml refuses to parse an empty document; treat that like a failed fetch
    # instead of letting the parser error escape.
    if not response.content.strip():
        print(
            "Got2Pee scraper: empty response body for {}".format(url),
            file=sys.stderr,
        )
        return None

    return html.fromstring(response.content)

def format_date(date_str):
    """Convert a date such as 'Oct 9, 2017' to 'YYYY-MM-DD'.

    Both abbreviated ('Oct 9, 2017') and full ('October 9, 2017') month
    names are accepted, and leading, trailing or repeated whitespace is
    ignored.

    Args:
        date_str: The date text to convert.

    Returns:
        The date as a 'YYYY-MM-DD' string, or ``None`` when *date_str* is not
        a string or does not match either accepted layout.
    """
    # strptime raises TypeError (not ValueError) for None and other
    # non-strings, so reject those up front.
    if not isinstance(date_str, str):
        return None

    cleaned = " ".join(date_str.split())
    for layout in ("%b %d, %Y", "%B %d, %Y"):
        try:
            return datetime.strptime(cleaned, layout).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None  # No accepted layout matched

def scrape_video_data(main_url):
    """Extract title, image, details, tags, studio, and date for a video URL.

    Args:
        main_url: URL of the video page to scrape.

    Returns:
        A dict with the keys ``title``, ``image``, ``details``, ``tags``,
        ``studio`` and ``date``. Fields that cannot be found are ``None``
        (``tags`` is an empty list). An empty dict is returned when the page
        cannot be fetched.

    Note:
        The date is looked up by ``scrape_video_date``, which downloads the
        page again along with the listing pages it links to.
    """
    tree = get_page_content(main_url)
    if tree is None:
        return {}

    # Use the first non-blank text node of the <h1>, trimmed, so markup
    # whitespace around the heading does not end up in the title.
    title = next(
        (text.strip() for text in tree.xpath("//h1/text()") if text.strip()),
        None,
    )
    image = tree.xpath("//div[@class='video-trailer']//img/@src")

    # The description is made of the text nodes that follow the <strong>
    # element inside the description div; join the non-blank pieces.
    description_parts = tree.xpath(
        "//div[@class='movie-description']/strong/following-sibling::text()"
    )
    details = " ".join(part.strip() for part in description_parts if part.strip())

    # Trim whitespace before removing the '#' so padded tags are cleaned too,
    # and skip anything that ends up empty.
    tags = []
    for raw_tag in tree.xpath("//span[@class='tags-list']//a/text()"):
        tag = raw_tag.strip().strip("#").strip()
        if tag:
            tags.append(tag)

    raw_date = scrape_video_date(main_url)
    formatted_date = format_date(raw_date) if raw_date else None

    return {
        "title": title,
        "image": image[0] if image else None,
        "details": details if details else None,
        "tags": [{"Name": tag} for tag in tags],
        "studio": {"Name": "Got2Pee"},
        "date": formatted_date,  # 'YYYY-MM-DD' or None
    }

def scrape_video_date(main_url):
    """Find the release date of *main_url* via the pages it links to.

    The function collects the video links listed on the page at *main_url*,
    opens each linked page, and searches that page's own list of links for
    one pointing back at *main_url*. The date text paired with that link is
    returned.

    Args:
        main_url: URL of the video page whose date is wanted.

    Returns:
        The date text as shown on the site (stripped), or ``None`` when the
        page cannot be fetched or no linked page lists *main_url*.
    """
    from urllib.parse import urljoin, urlsplit

    site_root = "https://got2pee.com"
    # Absolute paths into the page layout, shared by both lookups below.
    # NOTE(review): these break as soon as the page structure shifts.
    card_xpath = "/html/body/div[3]/div/div[2]/div[12]/section/div/div[1]/div/div"
    links_xpath = card_xpath + "/div[1]/a/@href"
    dates_xpath = card_xpath + "/div[2]/span[2]/text()"

    def comparison_key(url):
        """Reduce a URL to host + path so equivalent spellings compare equal."""
        parts = urlsplit(urljoin(site_root, url.strip()))
        host = parts.netloc.lower()
        if host.startswith("www."):
            host = host[len("www."):]
        # Scheme, query string, fragment and trailing slash are ignored.
        return host + parts.path.rstrip("/")

    tree = get_page_content(main_url)
    if tree is None:
        return None

    wanted = comparison_key(main_url)

    # urljoin resolves relative and protocol-relative hrefs correctly;
    # dict.fromkeys drops duplicates while keeping page order, so no linked
    # page is downloaded twice.
    related_video_urls = dict.fromkeys(
        urljoin(site_root, href.strip()) for href in tree.xpath(links_xpath)
    )

    for related_video_url in related_video_urls:
        related_tree = get_page_content(related_video_url)
        if related_tree is None:
            continue

        video_links = related_tree.xpath(links_xpath)
        video_dates = related_tree.xpath(dates_xpath)

        # NOTE(review): links and dates are paired by position, so an entry
        # without a date would shift every later pairing.
        for link, date in zip(video_links, video_dates):
            if comparison_key(link) == wanted:
                return date.strip()

    return None

# Script entry point: read a JSON object from stdin, scrape the page named by
# its "url" key, and print the result as JSON on stdout.
if __name__ == "__main__":
    import sys

    try:
        input_data = json.loads(sys.stdin.read())
    except json.JSONDecodeError as err:
        # Report malformed input clearly instead of dumping a traceback.
        print("Invalid JSON on stdin: {}".format(err), file=sys.stderr)
        sys.exit(1)

    # Valid JSON that is not an object (e.g. null or a list) has no "url".
    video_url = input_data.get("url") if isinstance(input_data, dict) else None

    # Always emit a JSON object, matching the empty dict that
    # scrape_video_data returns when the page cannot be fetched.
    result = scrape_video_data(video_url) if video_url else {}
    print(json.dumps(result, indent=4))

0 comments on commit 2fe6c7e

Please sign in to comment.