docs/on_demand.md (6 changes: 3 additions & 3 deletions)

@@ -39,14 +39,14 @@ You can:

1. **Simple Integration**: Import the enhanced scraper and provider:
```python
-from scraping.x.enhanced_apidojo_scraper import EnhancedApiDojoTwitterScraper
-from scraping.x.on_demand_model import EnhancedXContent
+from scraping.x.apidojo_scraper import ApiDojoTwitterScraper
+from scraping.x.model import XContent
```

2. **Update your scraper provider**:
```python
# Create enhanced scraper provider
-scraper_provider = EnhancedScraperProvider()
+scraper_provider = ApiDojoTwitterScraper()
```

3. **Enjoy richer data**: The enhanced content is automatically used for X/Twitter requests (a combined usage sketch follows below)
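Putting the three steps together, the only moving parts are the `ApiDojoTwitterScraper` instance and the `ScrapeConfig` you pass to `scrape()`. The sketch below is a minimal illustration, not code from this PR: the exact `ScrapeConfig` and `DateRange` constructor arguments, and the `common.data` import path for `DataLabel`, are assumptions based on the imports visible in this diff.

```python
import asyncio
import datetime as dt

from common.data import DataLabel                # assumed module path for DataLabel
from common.date_range import DateRange          # imported the same way in neurons/miner.py
from scraping.scraper import ScrapeConfig
from scraping.x.apidojo_scraper import ApiDojoTwitterScraper


async def fetch_recent_tweets():
    # Assumed ScrapeConfig/DateRange signatures; adjust to the real constructors.
    config = ScrapeConfig(
        entity_limit=10,
        date_range=DateRange(
            start=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=24),
            end=dt.datetime.now(dt.timezone.utc),
        ),
        labels=[DataLabel(value="#bittensor")],
    )

    scraper = ApiDojoTwitterScraper()
    # scrape() returns DataEntity objects directly (see neurons/miner.py below).
    data_entities = await scraper.scrape(config)
    for entity in data_entities:
        print(entity.uri, entity.datetime, entity.content_size_bytes)


asyncio.run(fetch_recent_tweets())
```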
neurons/miner.py (30 changes: 5 additions & 25 deletions)

@@ -50,7 +50,7 @@
from common.date_range import DateRange
from scraping.scraper import ScrapeConfig, ScraperId

-from scraping.x.enhanced_apidojo_scraper import EnhancedApiDojoTwitterScraper
+from scraping.x.apidojo_scraper import ApiDojoTwitterScraper
import json

# Enable logging to the miner TODO move it to some different location
@@ -637,31 +637,11 @@ async def handle_on_demand(self, synapse: OnDemandRequest) -> OnDemandRequest:
        # For X source, use the enhanced scraper directly
        if synapse.source == DataSource.X:
            # Initialize the enhanced scraper directly instead of using the provider
-           enhanced_scraper = EnhancedApiDojoTwitterScraper()
-           await enhanced_scraper.scrape(config)
-
-           # Get enhanced content
-           enhanced_content = enhanced_scraper.get_enhanced_content()
-
-           # Convert EnhancedXContent to DataEntity to maintain protocol compatibility
-           enhanced_data_entities = []
-           for content in enhanced_content:
-               # Convert to DataEntity but store full rich content in serialized form
-               api_response = content.to_api_response()
-               data_entity = DataEntity(
-                   uri=content.url,
-                   datetime=content.timestamp,
-                   source=DataSource.X,
-                   label=DataLabel(value=content.tweet_hashtags[0].lower()) if content.tweet_hashtags else None,
-                   # Store the full enhanced content as serialized JSON in the content field
-                   content=json.dumps(api_response).encode('utf-8'),
-                   content_size_bytes=len(json.dumps(api_response))
-               )
-               enhanced_data_entities.append(data_entity)
-
+           enhanced_scraper = ApiDojoTwitterScraper()
+           data_entities = await enhanced_scraper.scrape(config)

            # Update response with enhanced data entities
-           synapse.data = enhanced_data_entities[:synapse.limit] if synapse.limit else enhanced_data_entities
+           synapse.data = data_entities[:synapse.limit] if synapse.limit else data_entities
        else:
            # For Reddit, use the provider that's part of the coordinator
            from scraping.provider import ScraperProvider
else:
# For Reddit, use the provider that's part of the coordinator
from scraping.provider import ScraperProvider
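With the conversion loop gone, `handle_on_demand` only truncates whatever `scrape()` returns. A hedged sketch of how a caller might inspect the returned entities, assuming `DataEntity.content` still carries UTF-8 JSON the way the removed code serialized it:

```python
import json


def summarize_on_demand_response(synapse) -> None:
    """Print a short summary of the DataEntity objects in synapse.data.

    Assumes entity.content holds UTF-8-encoded JSON, as the removed
    enhanced path produced; adjust if ApiDojoTwitterScraper serializes
    its content differently.
    """
    for entity in synapse.data:
        payload = json.loads(entity.content.decode("utf-8"))
        print(entity.uri, entity.datetime, entity.content_size_bytes)
        print("  text:", payload.get("text", "")[:80])
```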
neurons/validator.py (6 changes: 3 additions & 3 deletions)

@@ -45,10 +45,10 @@
from common import constants
from common.protocol import OnDemandRequest
from common import utils
-from scraping.scraper import ScrapeConfig, ValidationResult
+from scraping.scraper import ScrapeConfig
from common.date_range import DateRange
from scraping.provider import ScraperProvider
-from scraping.x.enhanced_apidojo_scraper import EnhancedApiDojoTwitterScraper
+from scraping.x.apidojo_scraper import ApiDojoTwitterScraper
from vali_utils.miner_evaluator import MinerEvaluator
from vali_utils.load_balancer.validator_registry import ValidatorRegistry
import random
@@ -838,7 +838,7 @@ async def process_organic_query(self, synapse: OrganicRequest) -> OrganicRequest
            # For X data, use exactly the same approach as miners
            if on_demand_synapse.source == DataSource.X:
                # Initialize the enhanced scraper directly as miners do
-               scraper = EnhancedApiDojoTwitterScraper()
+               scraper = ApiDojoTwitterScraper()
            elif on_demand_synapse.source == DataSource.REDDIT:
                # For other sources, use the standard provider
                scraper_id = self.evaluator.PREFERRED_SCRAPERS.get(on_demand_synapse.source)
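The same source-based selection, pulled out of the validator for illustration. This is a sketch, not the PR's code: `provider.get(scraper_id)` is an assumed lookup method on `ScraperProvider`, and the `common.data` path for `DataSource` is an assumption.

```python
from common.data import DataSource                # assumed module path
from scraping.provider import ScraperProvider
from scraping.x.apidojo_scraper import ApiDojoTwitterScraper


def select_scraper(source, preferred_scrapers, provider=None):
    """X uses the ApiDojo scraper directly; other sources go through the provider."""
    if source == DataSource.X:
        return ApiDojoTwitterScraper()

    scraper_id = preferred_scrapers.get(source)
    provider = provider or ScraperProvider()
    # Assumed ScraperProvider API; replace with the provider's real lookup call.
    return provider.get(scraper_id)
```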
scraping/x/apidojo_scraper.py (59 changes: 55 additions & 4 deletions)

@@ -334,27 +334,62 @@ def _best_effort_parse_dataset(self, dataset: List[dict]) -> Tuple[List[XContent
                is_retweet = data.get('isRetweet', False)
                is_retweets.append(is_retweet)

+               # Extract media types for enhanced media support
+               media_types = self._extract_media_types(data)
+
+               # Extract author info
+               author = data.get('author', {})
+
                results.append(
                    XContent(
-                       username=data['author']['userName'],
+                       # Required fields
+                       username=f"@{data['author']['userName']}" if data.get('author', {}).get('userName') else "",
                        text=utils.sanitize_scraped_tweet(data['text']),
                        url=data["url"],
                        timestamp=dt.datetime.strptime(
                            data["createdAt"], "%a %b %d %H:%M:%S %z %Y"
                        ),
                        tweet_hashtags=tags,
                        media=media_urls if media_urls else None,
-                       # Enhanced fields
+                       # Enhanced fields (existing)
                        user_id=user_info['id'],
                        user_display_name=user_info['display_name'],
                        user_verified=user_info['verified'],
                        # Non-dynamic tweet metadata
                        tweet_id=data.get('id'),
                        is_reply=data.get('isReply', None),
                        is_quote=data.get('isQuote', None),
                        # Additional metadata
                        conversation_id=data.get('conversationId'),
                        in_reply_to_user_id=reply_info[0],
+
+                       # Enhanced fields for rich data
+                       language=data.get('lang'),
+                       full_text=data.get('fullText', data.get('text')),
+                       is_retweet=data.get('isRetweet', False),
+                       possibly_sensitive=data.get('possiblySensitive', False),
+                       in_reply_to_username=reply_info[1],
+                       quoted_tweet_id=data.get('quotedTweetId'),
+
+                       # Enhanced media
+                       media_urls=media_urls,
+                       media_types=media_types,
+
+                       # Dynamic engagement metrics
+                       like_count=data.get('likeCount'),
+                       retweet_count=data.get('retweetCount'),
+                       reply_count=data.get('replyCount'),
+                       quote_count=data.get('quoteCount'),
+                       view_count=data.get('viewCount'),
+                       bookmark_count=data.get('bookmarkCount'),
+
+                       # User profile data
+                       user_blue_verified=author.get('isBlueVerified', False),
+                       user_description=author.get('description'),
+                       user_location=author.get('location'),
+                       profile_image_url=author.get('profilePicture'),
+                       cover_picture_url=author.get('coverPicture'),
+                       user_followers_count=author.get('followers'),
+                       user_following_count=author.get('following'),
                    )
                )
            except Exception:
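Two of the parsing details above are easy to get wrong in isolation: the handle is normalized to a leading `@`, and `createdAt` uses Twitter's ctime-style timestamp format. A standalone sketch with a made-up record:

```python
import datetime as dt

# Made-up sample of the two fields this parser reads from an Apify tweet record.
data = {
    "author": {"userName": "example_user"},
    "createdAt": "Mon Apr 07 18:30:05 +0000 2025",
}

# Normalize the handle to "@username", falling back to "" when it is missing.
username = f"@{data['author']['userName']}" if data.get('author', {}).get('userName') else ""

# "%a %b %d %H:%M:%S %z %Y" matches the weekday/month/day/time/offset/year layout above.
timestamp = dt.datetime.strptime(data["createdAt"], "%a %b %d %H:%M:%S %z %Y")

print(username)   # @example_user
print(timestamp)  # 2025-04-07 18:30:05+00:00
```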
@@ -420,6 +455,22 @@ def _extract_media_urls(self, data: dict) -> List[str]:

        return media_urls

+    def _extract_media_types(self, data: dict) -> List[str]:
+        """Extract media types from tweet"""
+        media_data = data.get('media', [])
+
+        if not isinstance(media_data, list):
+            return []
+
+        media_types = []
+        for media_item in media_data:
+            if isinstance(media_item, dict):
+                media_types.append(media_item.get('type', 'photo'))
+            else:
+                media_types.append('photo')  # Default for string URLs
+
+        return media_types

    def _best_effort_parse_hf_dataset(self, dataset: List[dict]) -> List[dict]:
        """Performs a best effort parsing of Apify dataset into List[XContent]
        Any errors are logged and ignored."""
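For reference, this is what the new `_extract_media_types` helper yields for a mixed media list. The sample record is made up; the extraction logic is a standalone copy of the method above:

```python
from typing import List


def extract_media_types(data: dict) -> List[str]:
    """Standalone copy of ApiDojoTwitterScraper._extract_media_types for illustration."""
    media_data = data.get('media', [])
    if not isinstance(media_data, list):
        return []
    media_types = []
    for media_item in media_data:
        if isinstance(media_item, dict):
            media_types.append(media_item.get('type', 'photo'))
        else:
            media_types.append('photo')  # plain string URLs default to photo
    return media_types


sample_tweet = {
    "media": [
        {"type": "video", "url": "https://example.com/clip.mp4"},
        "https://example.com/picture.jpg",            # bare URL, no type info
        {"url": "https://example.com/untyped.jpg"},   # dict without a 'type' key
    ]
}

print(extract_media_types(sample_tweet))  # ['video', 'photo', 'photo']
```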