27 changes: 25 additions & 2 deletions scraping/youtube/model.py
@@ -2,9 +2,10 @@
import hashlib
import re
import unicodedata
from typing import Dict, List, Optional
from pydantic.v1 import BaseModel, Field
from typing import Dict, List, Optional, Union
from pydantic.v1 import BaseModel, Field, validator
from common.data import DataEntity, DataLabel, DataSource
from common.constants import YOUTUBE_TRANSCRIPT_END_FIELD_REQUIRED_DATE
from scraping import utils


@@ -60,6 +61,28 @@ def normalize_channel_name(name: str, max_len: int = 50) -> str:
return f"chan-{hash_suffix}"


class TranscriptSegmentWithEnd(BaseModel):
"""Transcript segment using 'end' field (standard format)."""

class Config:
extra = "forbid"

text: str = Field(description="The transcript text for this segment")
start: float = Field(description="Start time in seconds")
end: float = Field(description="End time in seconds")


class TranscriptSegmentWithDuration(BaseModel):
"""Transcript segment using 'duration' field (deprecated, grace period until Oct 8, 2025)."""

class Config:
extra = "forbid"

text: str = Field(description="The transcript text for this segment")
start: float = Field(description="Start time in seconds")
duration: float = Field(description="Duration in seconds (deprecated)")


class YouTubeContent(BaseModel):
"""The content model for YouTube transcripts with language support."""

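A minimal sketch (not part of the diff) of how extra = "forbid" causes padded segments to be rejected; it assumes the pydantic.v1 import path used in model.py and a hypothetical 'spam' key that a miner might add to inflate byte count. The two models carry the same timing information: a 'duration' segment presumably corresponds to an 'end' segment with end = start + duration.

from pydantic.v1 import ValidationError
from scraping.youtube.model import TranscriptSegmentWithEnd

padded = {"text": "hello", "start": 0.0, "end": 1.5, "spam": "x" * 10_000}
try:
    TranscriptSegmentWithEnd(**padded)
except ValidationError as exc:
    print(exc)  # reports "extra fields not permitted" for the 'spam' key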
84 changes: 83 additions & 1 deletion scraping/youtube/utils.py
@@ -7,6 +7,7 @@
from scraping.scraper import ValidationResult
from common.data import DataEntity
from common.constants import YOUTUBE_TIMESTAMP_OBFUSCATION_REQUIRED_DATE, YOUTUBE_TRANSCRIPT_END_FIELD_REQUIRED_DATE
from scraping.youtube.model import TranscriptSegmentWithEnd, TranscriptSegmentWithDuration
from .model import YouTubeContent
from .model import normalize_channel_name

@@ -240,6 +241,70 @@ def validate_youtube_data_entity_fields(actual_content: YouTubeContent, entity:
)


def validate_transcript_segment_fields(
transcript: List[Dict],
entity: DataEntity
) -> Optional[ValidationResult]:
"""
Validate that transcript segments only contain allowed fields and no extras.
Prevents miners from padding segments with spam fields to inflate byte count.

Args:
transcript: List of transcript segment dictionaries
entity: DataEntity being validated

Returns:
ValidationResult if validation fails, None if passes
"""

now = dt.datetime.now(dt.timezone.utc)
grace_period_active = now < YOUTUBE_TRANSCRIPT_END_FIELD_REQUIRED_DATE

for i, segment in enumerate(transcript):
if not isinstance(segment, dict):
return ValidationResult(
is_valid=False,
reason=f"Transcript segment {i} is not a dictionary",
content_size_bytes_validated=entity.content_size_bytes,
)

# Try to parse segment using Pydantic models to enforce schema
# This will reject any extra fields due to extra="forbid"
try:
has_end = 'end' in segment
has_duration = 'duration' in segment

if has_end:
# Try to parse as TranscriptSegmentWithEnd
TranscriptSegmentWithEnd(**segment)
elif has_duration and grace_period_active:
# During grace period, allow duration format
TranscriptSegmentWithDuration(**segment)
elif has_duration and not grace_period_active:
# After grace period, duration is not allowed
return ValidationResult(
is_valid=False,
reason=f"Transcript segment {i} uses deprecated 'duration' field (grace period expired)",
content_size_bytes_validated=entity.content_size_bytes,
)
else:
return ValidationResult(
is_valid=False,
reason=f"Transcript segment {i} missing required timing fields",
content_size_bytes_validated=entity.content_size_bytes,
)
except Exception as e:
# Pydantic validation failed - likely due to extra fields or wrong types
bt.logging.info(f"Transcript segment {i} validation failed: {str(e)}")
return ValidationResult(
is_valid=False,
reason=f"Transcript segment {i} has invalid structure or extra fields: {str(e)}",
content_size_bytes_validated=entity.content_size_bytes,
)

return None # All segments are valid
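# Hypothetical usage sketch (not part of the diff); `entity` stands in for the
# DataEntity being validated.
example_segments = [
    {"text": "intro", "start": 0.0, "end": 2.0},                # always accepted
    {"text": "next", "start": 2.0, "duration": 1.5},            # accepted only while the grace period is active
    {"text": "outro", "start": 3.5, "end": 5.0, "views": 999},  # rejected: extra field
]
result = validate_transcript_segment_fields(example_segments, entity)
if result is not None:
    print(result.reason)  # the first failing segment determines the reason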


def validate_transcript_timing(
transcript: List[Dict],
video_duration_seconds: int,
@@ -428,7 +493,15 @@ def validate_youtube_data_entities(
content_size_bytes_validated=entity_to_validate.content_size_bytes
)

# Step 5.5: Validate transcript timing structure (anti-cheating)
# Step 5.5: Validate transcript segments don't have extra fields (anti-padding)
segment_fields_validation = validate_transcript_segment_fields(
content_to_validate.transcript,
entity_to_validate
)
if segment_fields_validation is not None:
return segment_fields_validation

# Step 5.6: Validate transcript timing structure (anti-cheating)
timing_validation = validate_transcript_timing(
content_to_validate.transcript,
content_to_validate.duration_seconds,
@@ -437,6 +510,15 @@
if timing_validation is not None:
return timing_validation

# Step 5.7: Validate content size (prevent byte inflation with extra fields)
byte_difference_allowed = 20 # Allow small differences for encoding/formatting variations
if (entity_to_validate.content_size_bytes - actual_entity.content_size_bytes) > byte_difference_allowed:
return ValidationResult(
is_valid=False,
reason=f"Claimed bytes ({entity_to_validate.content_size_bytes}) exceed actual content size ({actual_entity.content_size_bytes}) by more than {byte_difference_allowed} bytes",
content_size_bytes_validated=entity_to_validate.content_size_bytes,
)
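# Worked example (illustrative): a claimed size of 10_250 bytes against an actual
# re-scraped size of 10_200 bytes fails here (difference of 50 > 20), while a
# claim of 10_215 bytes would pass (difference of 15 <= 20).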

# Step 6: Ensure both DataEntity datetime fields are obfuscated before comparison
entity_to_validate_obfuscated = entity_to_validate.model_copy(update={
'datetime': utils.obfuscate_datetime_to_minute(entity_to_validate.datetime)