Skip to content

Commit 46516fc

Browse files
committed
Initial work on yt-dlp
1 parent 5cdc50e commit 46516fc

File tree

5 files changed

+67
-29
lines changed

5 files changed

+67
-29
lines changed

PythonRpcServer/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
venv/

PythonRpcServer/requirements.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ wcwidth==0.2.13
3232

3333
# Not versioned
3434
numpy
35-
pytube # if not available, use the tar.gz package (see Dockerfile)
36-
35+
# pytube is no longer maintained, so it is disabled here (previous note: if not available, use the tar.gz package; see Dockerfile)
36+
yt-dlp
37+
#Always get latest
3738

3839
# protobuf version 3.18.3 causes NotImplementedError("To be implemented") in PythonRpcServer/mediaprovider.py
3940
# Likely need to coordinate updating the C# version too

PythonRpcServer/youtube.py

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
1-
from pytube.extract import playlist_id
1+
# from pytube.extract import playlist_id
2+
3+
# from yt_dlp import YoutubeDL
4+
import yt_dlp
5+
26
import requests
3-
from utils import encode, decode, getRandomString, download_file
7+
from utils import getRandomString
48
import os
59
import json
610
from time import perf_counter
11+
import datetime
712

813
#from pytube import YouTube
9-
import pytube
14+
# import pytube
1015

1116
from mediaprovider import MediaProvider, InvalidPlaylistInfoException
1217

@@ -42,7 +47,10 @@ def get_youtube_channel(self, identifier):
4247
print(f'get_youtube_channel({identifier})')
4348

4449
url = YOUTUBE_CHANNEL_BASE_URL+ identifier
45-
channel = pytube.Channel(url)
50+
# Use yt_dlp to create a channel,
51+
52+
channel = yt_dlp.Youtube(url).get_channel()
53+
## channel.playlist_id = channel.playlist_id.replace('UC', 'UU')
4654

4755
playlist_id = channel.playlist_id
4856
# According to one StackOverflow answer and one test, channels-to-playlists can also be converted with a string replace of UCXXXX to UUXXXX
@@ -53,26 +61,33 @@ def get_youtube_playlist(self, identifier):
5361
try:
5462
start_time = perf_counter()
5563

56-
url= YOUTUBE_PLAYLIST_BASE_URL+ identifier
64+
url= YOUTUBE_PLAYLIST_BASE_URL + identifier
5765
print(f"get_youtube_playlist(identifier): {url}")
58-
playlist = pytube.Playlist(url)
59-
66+
67+
ydl_opts = {
68+
'quiet': True,
69+
'extract_flat': 'in_playlist', # Ensure we are extracting playlist entries
70+
'force_generic_extractor': True,
71+
}
6072
medias = []
61-
for v in playlist.videos:
62-
63-
published_at = v.publish_date.strftime('%Y/%m/%d')
64-
media = {
65-
#"channelTitle": channelTitle,
66-
"channelId": v.channel_id,
67-
"playlistId": identifier,
68-
"title": v.title,
69-
"description": v.description,
70-
"publishedAt": published_at,
71-
"videoUrl": v.watch_url,
72-
"videoId": v.video_id,
73-
"createdAt": published_at
74-
}
75-
medias.append(media)
73+
# Current time in YYYYMMDD format
74+
now = datetime.datetime.now().strftime('%Y%m%d')
75+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
76+
info_dict = ydl.extract_info(url, download=False)
77+
for entry in info_dict.get( 'entries', []):
78+
print(entry)
79+
published_at = entry.get('upload_date', now)
80+
media = {
81+
"channelId": entry['channel_id'],
82+
"playlistId": identifier,
83+
"title": entry['title'],
84+
"description": entry['description'],
85+
"publishedAt": published_at,
86+
"videoUrl": "https://youtube.com/watch?v="+entry['id'],
87+
"videoId": entry['id'],
88+
"createdAt": published_at
89+
}
90+
medias.append(media)
7691
end_time = perf_counter()
7792
print(f'Youtube playlist {identifier}: Returning {len(medias)} items. Processing time {end_time - start_time :.2f} seconds')
7893
return medias
@@ -86,7 +101,21 @@ def download_youtube_video(self, youtubeUrl):
86101
start_time = perf_counter()
87102
extension = '.mp4'
88103
filename = getRandomString(8)
89-
filepath = pytube.YouTube(youtubeUrl).streams.filter(subtype='mp4').get_highest_resolution().download(output_path = DATA_DIRECTORY, filename = filename)
104+
filepath =f'{DATA_DIRECTORY}/{filename}'
105+
ydl_opts = {
106+
'quiet': True,
107+
'format': 'best[ext=mp4]',
108+
'outtmpl': filepath,
109+
'cachedir' : False,
110+
'progress_hooks': [],
111+
'call_home': False,
112+
'no_color': True,
113+
'noprogress': True,
114+
}
115+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
116+
x = ydl.download([youtubeUrl])
117+
print(x)
118+
#filepath = yt_dlp.YoutubeDL(ydl_opts).streams.filter(subtype='mp4').get_highest_resolution().download(output_path = DATA_DIRECTORY, filename = filename)
90119
end_time = perf_counter()
91120
print(f"download_youtube_video({youtubeUrl}): Done. Downloaded in {end_time - start_time :.2f} seconds")
92121
return filepath, extension

PythonRpcServer/youtube_test.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import youtube
77

8-
def test_youtube():
8+
def test_youtube1():
99
print("Test 1/2: Download playlist")
1010
yt=youtube.YoutubeProvider()
1111
pl=yt.get_youtube_playlist('PLBgxzZMu3GpPb35BDIU5eeopR4MhBOZw_')
@@ -17,7 +17,9 @@ def test_youtube():
1717

1818
assert 'STAT 385' in pl[0]['title']
1919

20+
def test_youtube2():
2021
print("Test 2/2: Download video")
22+
yt=youtube.YoutubeProvider()
2123
onevid = yt.download_youtube_video('https://youtube.com/watch?v=DqHMh8nqCPw') # 24-72 seconds typical
2224
print(onevid)
2325
assert len(onevid) == 2
@@ -34,4 +36,5 @@ def test_youtube():
3436
print("All tests completed")
3537

3638
if __name__ == "__main__":
37-
test_youtube()
39+
test_youtube1()
40+
test_youtube2()

pythonrpcserver.Dockerfile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@
2323
COPY --from=whisperbuild /whisper.cpp/models /PythonRpcServer/models
2424
WORKDIR /PythonRpcServer
2525

26+
# Don't copy any py files here, so that we don't need to re-run whisper
27+
COPY ./PythonRpcServer/transcribe_hellohellohello.wav .
28+
# The output of this whisper run is used when we set MOCK_RECOGNITION=MOCK for quick testing
29+
RUN whisper -ojf -f transcribe_hellohellohello.wav
30+
2631
COPY ./PythonRpcServer/requirements.txt requirements.txt
2732
RUN pip install --no-cache-dir --upgrade pip && \
2833
pip install --no-cache-dir -r requirements.txt
@@ -31,8 +36,7 @@
3136
RUN python -m grpc_tools.protoc -I . --python_out=./ --grpc_python_out=./ ct.proto
3237

3338
COPY ./PythonRpcServer .
34-
# The output of this file is used when we set MOCK_RECOGNITION=MOCK for quick testing
35-
RUN whisper -ojf -f transcribe_hellohellohello.wav
39+
3640

3741
CMD [ "nice", "-n", "18", "ionice", "-c", "2", "-n", "6", "python3", "-u", "/PythonRpcServer/server.py" ]
3842

0 commit comments

Comments
 (0)