Update On Sat Oct 25 20:37:53 CEST 2025

This commit is contained in:
github-action[bot]
2025-10-25 20:37:53 +02:00
parent bf9e802a78
commit d01a190a31
48 changed files with 4328 additions and 3932 deletions

View File

@@ -13,12 +13,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import contextlib
import copy
import itertools
import json
from test.helper import FakeYDL, assertRegexpMatches, try_rm
from yt_dlp import YoutubeDL
from yt_dlp.extractor import YoutubeIE
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.postprocessor.common import PostProcessor
from yt_dlp.utils import (
@@ -337,99 +335,6 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL({'format': '[format_id!*=-]'})
self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
def test_youtube_format_selection(self):
# FIXME: Rewrite in accordance with the new format sorting options
return
order = [
'38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13',
# Apple HTTP Live Streaming
'96', '95', '94', '93', '92', '132', '151',
# 3D
'85', '84', '102', '83', '101', '82', '100',
# Dash video
'137', '248', '136', '247', '135', '246',
'245', '244', '134', '243', '133', '242', '160',
# Dash audio
'141', '172', '140', '171', '139',
]
def format_info(f_id):
info = YoutubeIE._formats[f_id].copy()
# XXX: In real cases InfoExtractor._parse_mpd_formats() fills up 'acodec'
# and 'vcodec', while in tests such information is incomplete since
# commit a6c2c24479e5f4827ceb06f64d855329c0a6f593
# test_YoutubeDL.test_youtube_format_selection is broken without
# this fix
if 'acodec' in info and 'vcodec' not in info:
info['vcodec'] = 'none'
elif 'vcodec' in info and 'acodec' not in info:
info['acodec'] = 'none'
info['format_id'] = f_id
info['url'] = 'url:' + f_id
return info
formats_order = [format_info(f_id) for f_id in order]
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': 'bestvideo+bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], '248+172')
self.assertEqual(downloaded['ext'], 'mp4')
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': 'bestvideo[height>=999999]+bestaudio/best'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], '38')
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': 'bestvideo/best,bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts]
self.assertEqual(downloaded_ids, ['137', '141'])
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts]
self.assertEqual(downloaded_ids, ['137+141', '248+141'])
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])[height<=720]+bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts]
self.assertEqual(downloaded_ids, ['136+141', '247+141'])
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': '(bestvideo[ext=none]/bestvideo[ext=webm])+bestaudio'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts]
self.assertEqual(downloaded_ids, ['248+141'])
for f1, f2 in itertools.pairwise(formats_order):
info_dict = _make_result([f1, f2], extractor='youtube')
ydl = YDL({'format': 'best/bestvideo'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1['format_id'])
info_dict = _make_result([f2, f1], extractor='youtube')
ydl = YDL({'format': 'best/bestvideo'})
ydl.sort_formats(info_dict)
ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1['format_id'])
def test_audio_only_extractor_format_selection(self):
# For extractors with incomplete formats (all formats are audio-only or
# video-only) best and worst should fallback to corresponding best/worst

View File

@@ -5,12 +5,9 @@ import re
import urllib.parse
from .common import InfoExtractor
from .youtube import YoutubeBaseInfoExtractor, YoutubeIE
from ..networking import HEADRequest
from ..networking.exceptions import HTTPError
from .youtube import YoutubeBaseInfoExtractor
from ..utils import (
KNOWN_EXTENSIONS,
ExtractorError,
bug_reports_message,
clean_html,
dict_get,
@@ -21,18 +18,14 @@ from ..utils import (
join_nonempty,
js_to_json,
merge_dicts,
mimetype2ext,
orderedSet,
parse_duration,
parse_qs,
str_or_none,
str_to_int,
traverse_obj,
try_get,
unified_strdate,
unified_timestamp,
url_or_none,
urlhandle_detect_ext,
)
@@ -471,7 +464,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
'info_dict': {
'id': 'lTx3G6h2xyA',
'ext': 'flv',
'ext': 'mp4',
'title': 'Madeon - Pop Culture (live mashup)',
'upload_date': '20110711',
'uploader': 'Madeon',
@@ -578,7 +571,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc',
'info_dict': {
'id': 'Q_yjX80U7Yc',
'ext': 'flv',
'ext': 'webm',
'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest',
'uploader_id': 'claybutlermusic',
'description': 'md5:4595264559e3d0a0ceb3f011f6334543',
@@ -680,6 +673,37 @@ class YoutubeWebArchiveIE(InfoExtractor):
'upload_date': '20120407',
'uploader_id': 'thecomputernerd01',
},
}, {
# Contains split audio/video formats
'url': 'ytarchive:o_T_S_TU12M',
'info_dict': {
'id': 'o_T_S_TU12M',
'ext': 'mp4',
'title': 'Prairie Pulse 1218; Lin Enger, Paul Olson',
'description': 'md5:36e7a34cdc8508e35a920ec042e799c7',
'uploader': 'Prairie Public',
'channel_id': 'UC4BOzQel6tvJm7OEDd3vZlw',
'channel_url': 'https://www.youtube.com/channel/UC4BOzQel6tvJm7OEDd3vZlw',
'duration': 1606,
'upload_date': '20150213',
},
}, {
# Video unavailable through wayback-fakeurl
'url': 'ytarchive:SQCom7wjGDs',
'info_dict': {
'id': 'SQCom7wjGDs',
'ext': 'mp4',
'title': 'Jamin Warren from PBS Game/Show decides that Portal is a feminist Game [Top Hats and No Brain]',
'description': 'md5:c0cb876dd075483ead9afcc86798efb0',
'uploader': 'Top Hats and Champagne',
'uploader_id': 'sparrowtm',
'uploader_url': 'https://www.youtube.com/user/sparrowtm',
'channel_id': 'UCW3T5nG4iEkI7HjG-Du3HQA',
'channel_url': 'https://www.youtube.com/channel/UCW3T5nG4iEkI7HjG-Du3HQA',
'duration': 1500,
'thumbnail': 'https://web.archive.org/web/20160108040020if_/https://i.ytimg.com/vi/SQCom7wjGDs/maxresdefault.jpg',
'upload_date': '20160107',
},
}, {
'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
'only_matching': True,
@@ -724,6 +748,113 @@ class YoutubeWebArchiveIE(InfoExtractor):
_OLDEST_CAPTURE_DATE = 20050214000000
_NEWEST_CAPTURE_DATE = 20500101000000
_FORMATS = {
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'vcodec': 'h263'},
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'vcodec': 'h263'},
'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'vcodec': 'mp4v'},
'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'vcodec': 'h264'},
'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'vcodec': 'h264'},
'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'vcodec': 'h264'},
'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'vcodec': 'h264'},
# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'vcodec': 'h264'},
'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'vcodec': 'h264'},
'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'vcodec': 'vp8'},
'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'vcodec': 'vp8'},
'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'vcodec': 'vp8'},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'vcodec': 'vp8'},
'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'vcodec': 'h264'},
'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'vcodec': 'h264'},
# 3D videos
'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20},
'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20},
'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20},
'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20},
'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'vcodec': 'vp8', 'preference': -20},
'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'vcodec': 'vp8', 'preference': -20},
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'vcodec': 'vp8', 'preference': -20},
# Apple HTTP Live Streaming
'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'},
# DASH mp4 video
'133': {'ext': 'mp4', 'height': 240, 'vcodec': 'h264', 'acodec': 'none'},
'134': {'ext': 'mp4', 'height': 360, 'vcodec': 'h264', 'acodec': 'none'},
'135': {'ext': 'mp4', 'height': 480, 'vcodec': 'h264', 'acodec': 'none'},
'136': {'ext': 'mp4', 'height': 720, 'vcodec': 'h264', 'acodec': 'none'},
'137': {'ext': 'mp4', 'height': 1080, 'vcodec': 'h264', 'acodec': 'none'},
'138': {'ext': 'mp4', 'vcodec': 'h264', 'acodec': 'none'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
'160': {'ext': 'mp4', 'height': 144, 'vcodec': 'h264', 'acodec': 'none'},
'212': {'ext': 'mp4', 'height': 480, 'vcodec': 'h264', 'acodec': 'none'},
'264': {'ext': 'mp4', 'height': 1440, 'vcodec': 'h264', 'acodec': 'none'},
'298': {'ext': 'mp4', 'height': 720, 'vcodec': 'h264', 'fps': 60, 'acodec': 'none'},
'299': {'ext': 'mp4', 'height': 1080, 'vcodec': 'h264', 'fps': 60, 'acodec': 'none'},
'266': {'ext': 'mp4', 'height': 2160, 'vcodec': 'h264', 'acodec': 'none'},
# Dash mp4 audio
'139': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'140': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'141': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'256': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'258': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'},
'325': {'ext': 'm4a', 'acodec': 'dtse', 'vcodec': 'none'},
'328': {'ext': 'm4a', 'acodec': 'ec-3', 'vcodec': 'none'},
# Dash webm
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'vcodec': 'vp8'},
'168': {'ext': 'webm', 'height': 480, 'width': 854, 'vcodec': 'vp8'},
'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'vcodec': 'vp8'},
'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'vcodec': 'vp8'},
'218': {'ext': 'webm', 'height': 480, 'width': 854, 'vcodec': 'vp8'},
'219': {'ext': 'webm', 'height': 480, 'width': 854, 'vcodec': 'vp8'},
'278': {'ext': 'webm', 'height': 144, 'vcodec': 'vp9', 'acodec': 'none'},
'242': {'ext': 'webm', 'height': 240, 'vcodec': 'vp9', 'acodec': 'none'},
'243': {'ext': 'webm', 'height': 360, 'vcodec': 'vp9', 'acodec': 'none'},
'244': {'ext': 'webm', 'height': 480, 'vcodec': 'vp9', 'acodec': 'none'},
'245': {'ext': 'webm', 'height': 480, 'vcodec': 'vp9', 'acodec': 'none'},
'246': {'ext': 'webm', 'height': 480, 'vcodec': 'vp9', 'acodec': 'none'},
'247': {'ext': 'webm', 'height': 720, 'vcodec': 'vp9', 'acodec': 'none'},
'248': {'ext': 'webm', 'height': 1080, 'vcodec': 'vp9', 'acodec': 'none'},
'271': {'ext': 'webm', 'height': 1440, 'vcodec': 'vp9', 'acodec': 'none'},
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
'272': {'ext': 'webm', 'height': 2160, 'vcodec': 'vp9', 'acodec': 'none'},
'302': {'ext': 'webm', 'height': 720, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'},
'303': {'ext': 'webm', 'height': 1080, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'},
'308': {'ext': 'webm', 'height': 1440, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'},
'313': {'ext': 'webm', 'height': 2160, 'vcodec': 'vp9', 'acodec': 'none'},
'315': {'ext': 'webm', 'height': 2160, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'},
# Dash webm audio
'171': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none'},
'172': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none'},
# Dash webm audio with opus inside
'249': {'ext': 'webm', 'acodec': 'opus', 'vcodec': 'none'},
'250': {'ext': 'webm', 'acodec': 'opus', 'vcodec': 'none'},
'251': {'ext': 'webm', 'acodec': 'opus', 'vcodec': 'none'},
# av01 video only formats sometimes served with "unknown" codecs
'394': {'ext': 'mp4', 'height': 144, 'vcodec': 'av01.0.00M.08', 'acodec': 'none'},
'395': {'ext': 'mp4', 'height': 240, 'vcodec': 'av01.0.00M.08', 'acodec': 'none'},
'396': {'ext': 'mp4', 'height': 360, 'vcodec': 'av01.0.01M.08', 'acodec': 'none'},
'397': {'ext': 'mp4', 'height': 480, 'vcodec': 'av01.0.04M.08', 'acodec': 'none'},
'398': {'ext': 'mp4', 'height': 720, 'vcodec': 'av01.0.05M.08', 'acodec': 'none'},
'399': {'ext': 'mp4', 'height': 1080, 'vcodec': 'av01.0.08M.08', 'acodec': 'none'},
'400': {'ext': 'mp4', 'height': 1440, 'vcodec': 'av01.0.12M.08', 'acodec': 'none'},
'401': {'ext': 'mp4', 'height': 2160, 'vcodec': 'av01.0.12M.08', 'acodec': 'none'},
}
def _call_cdx_api(self, item_id, url, filters: list | None = None, collapse: list | None = None, query: dict | None = None, note=None, fatal=False):
# CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
query = {
@@ -933,23 +1064,13 @@ class YoutubeWebArchiveIE(InfoExtractor):
video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2')
url_date = url_date or url_date_2
urlh = None
retry_manager = self.RetryManager(fatal=False)
for retry in retry_manager:
try:
urlh = self._request_webpage(
HEADRequest(f'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{video_id}'),
video_id, note='Fetching archived video file url', expected_status=True)
except ExtractorError as e:
# HTTP Error 404 is expected if the video is not saved.
if isinstance(e.cause, HTTPError) and e.cause.status == 404:
self.raise_no_formats(
'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True)
else:
retry.error = e
video_info = self._download_json(
'https://web.archive.org/__wb/videoinfo', video_id,
query={'vtype': 'youtube', 'vid': video_id})
if retry_manager.error:
self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id)
if not traverse_obj(video_info, 'formats'):
self.raise_no_formats(
'The requested video is not archived or indexed', expected=True)
capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))
@@ -968,25 +1089,18 @@ class YoutubeWebArchiveIE(InfoExtractor):
info['thumbnails'] = self._extract_thumbnails(video_id)
if urlh:
url = urllib.parse.unquote(urlh.url)
video_file_url_qs = parse_qs(url)
# Attempt to recover any ext & format info from playback url & response headers
fmt = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
itag = try_get(video_file_url_qs, lambda x: x['itag'][0])
if itag and itag in YoutubeIE._formats:
fmt.update(YoutubeIE._formats[itag])
fmt.update({'format_id': itag})
else:
mime = try_get(video_file_url_qs, lambda x: x['mime'][0])
ext = (mimetype2ext(mime)
or urlhandle_detect_ext(urlh)
or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type')))
fmt.update({'ext': ext})
info['formats'] = [fmt]
if not info.get('duration'):
info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0]))
formats = []
for fmt in traverse_obj(video_info, ('formats', lambda _, v: url_or_none(v['url']))):
format_id = traverse_obj(fmt, ('url', {parse_qs}, 'itag', 0))
formats.append({
'format_id': format_id,
**self._FORMATS.get(format_id, {}),
**traverse_obj(fmt, {
'url': ('url', {lambda x: f'https://web.archive.org/web/2id_/{x}'}),
'ext': ('ext', {str}),
'filesize': ('url', {parse_qs}, 'clen', 0, {int_or_none}),
}),
})
info['formats'] = formats
if not info.get('title'):
info['title'] = video_id
return info

View File

@@ -1,21 +1,19 @@
import re
import urllib.parse
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
ExtractorError,
bug_reports_message,
determine_ext,
extract_attributes,
get_element_by_class,
get_element_html_by_id,
int_or_none,
lowercase_escape,
parse_qs,
try_get,
mimetype2ext,
parse_duration,
str_or_none,
update_url_query,
url_or_none,
)
from ..utils.traversal import traverse_obj, value
class GoogleDriveIE(InfoExtractor):
@@ -38,8 +36,8 @@ class GoogleDriveIE(InfoExtractor):
'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
'ext': 'mp4',
'title': 'Big Buck Bunny.mp4',
'duration': 45,
'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
'duration': 45.069,
'thumbnail': r're:https://lh3\.googleusercontent\.com/drive-storage/',
},
}, {
# has itag 50 which is not in YoutubeIE._formats (royalty Free music from 1922)
@@ -49,8 +47,18 @@ class GoogleDriveIE(InfoExtractor):
'id': '1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
'ext': 'mp3',
'title': 'My Buddy - Henry Burr - Gus Kahn - Walter Donaldson.mp3',
'duration': 184,
'thumbnail': 'https://drive.google.com/thumbnail?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
'duration': 184.68,
},
}, {
# Has subtitle track
'url': 'https://drive.google.com/file/d/1RAGWRgzn85TXCaCk4gxnwF6TGUaZatzE/view',
'md5': '05488c528da6ef737ec8c962bfa9724e',
'info_dict': {
'id': '1RAGWRgzn85TXCaCk4gxnwF6TGUaZatzE',
'ext': 'mp4',
'title': 'test.mp4',
'duration': 9.999,
'thumbnail': r're:https://lh3\.googleusercontent\.com/drive-storage/',
},
}, {
# video can't be watched anonymously due to view count limit reached,
@@ -71,17 +79,6 @@ class GoogleDriveIE(InfoExtractor):
'url': 'https://drive.usercontent.google.com/download?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
'only_matching': True,
}]
_FORMATS_EXT = {
**{k: v['ext'] for k, v in YoutubeIE._formats.items() if v.get('ext')},
'50': 'm4a',
}
_BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
_CAPTIONS_ENTRY_TAG = {
'subtitles': 'track',
'automatic_captions': 'target',
}
_caption_formats_ext = []
_captions_xml = None
@classmethod
def _extract_embed_urls(cls, url, webpage):
@@ -91,129 +88,71 @@ class GoogleDriveIE(InfoExtractor):
if mobj:
yield 'https://drive.google.com/file/d/{}'.format(mobj.group('id'))
def _download_subtitles_xml(self, video_id, subtitles_id, hl):
if self._captions_xml:
return
self._captions_xml = self._download_xml(
self._BASE_URL_CAPTIONS, video_id, query={
'id': video_id,
'vid': subtitles_id,
'hl': hl,
@staticmethod
def _construct_subtitle_url(base_url, video_id, language, fmt):
return update_url_query(
base_url, {
'hl': 'en-US',
'v': video_id,
'type': 'track',
'lang': language,
'fmt': fmt,
})
def _get_subtitles(self, video_id, video_info):
subtitles = {}
timed_text_base_url = traverse_obj(video_info, ('timedTextDetails', 'timedTextBaseUrl', {url_or_none}))
if not timed_text_base_url:
return subtitles
subtitle_data = self._download_xml(
timed_text_base_url, video_id, 'Downloading subtitles XML', fatal=False, query={
'hl': 'en-US',
'type': 'list',
'tlangs': '1',
'fmts': '1',
'vssids': '1',
}, note='Downloading subtitles XML',
errnote='Unable to download subtitles XML', fatal=False)
if self._captions_xml:
for f in self._captions_xml.findall('format'):
if f.attrib.get('fmt_code') and not f.attrib.get('default'):
self._caption_formats_ext.append(f.attrib['fmt_code'])
def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
origin_lang_code=None, origin_lang_name=None):
if not subtitles_id or not caption_type:
return
captions = {}
for caption_entry in self._captions_xml.findall(
self._CAPTIONS_ENTRY_TAG[caption_type]):
caption_lang_code = caption_entry.attrib.get('lang_code')
caption_name = caption_entry.attrib.get('name') or origin_lang_name
if not caption_lang_code or not caption_name:
self.report_warning(f'Missing necessary caption metadata. '
f'Need lang_code and name attributes. '
f'Found: {caption_entry.attrib}')
continue
caption_format_data = []
for caption_format in self._caption_formats_ext:
query = {
'vid': subtitles_id,
'v': video_id,
'fmt': caption_format,
'lang': (caption_lang_code if origin_lang_code is None
else origin_lang_code),
'type': 'track',
'name': caption_name,
'kind': '',
}
if origin_lang_code is not None:
query.update({'tlang': caption_lang_code})
caption_format_data.append({
'url': update_url_query(self._BASE_URL_CAPTIONS, query),
'ext': caption_format,
})
captions[caption_lang_code] = caption_format_data
return captions
def _get_subtitles(self, video_id, subtitles_id, hl):
if not subtitles_id or not hl:
return
self._download_subtitles_xml(video_id, subtitles_id, hl)
if not self._captions_xml:
return
return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
def _get_automatic_captions(self, video_id, subtitles_id, hl):
if not subtitles_id or not hl:
return
self._download_subtitles_xml(video_id, subtitles_id, hl)
if not self._captions_xml:
return
track = next((t for t in self._captions_xml.findall('track') if t.attrib.get('cantran') == 'true'), None)
if track is None:
return
origin_lang_code = track.attrib.get('lang_code')
origin_lang_name = track.attrib.get('name')
if not origin_lang_code or not origin_lang_name:
return
return self._get_captions_by_type(
video_id, subtitles_id, 'automatic_captions', origin_lang_code, origin_lang_name)
'tlangs': 1,
'v': video_id,
'vssids': 1,
})
subtitle_formats = traverse_obj(subtitle_data, (lambda _, v: v.tag == 'format', {lambda x: x.get('fmt_code')}, {str}))
for track in traverse_obj(subtitle_data, (lambda _, v: v.tag == 'track' and v.get('lang_code'))):
language = track.get('lang_code')
subtitles.setdefault(language, []).extend([{
'url': self._construct_subtitle_url(timed_text_base_url, video_id, language, sub_fmt),
'name': track.get('lang_original'),
'ext': sub_fmt,
} for sub_fmt in subtitle_formats])
return subtitles
def _real_extract(self, url):
video_id = self._match_id(url)
video_info = urllib.parse.parse_qs(self._download_webpage(
'https://drive.google.com/get_video_info',
video_id, 'Downloading video webpage', query={'docid': video_id}))
def get_value(key):
return try_get(video_info, lambda x: x[key][0])
reason = get_value('reason')
title = get_value('title')
video_info = self._download_json(
f'https://content-workspacevideo-pa.googleapis.com/v1/drive/media/{video_id}/playback',
video_id, 'Downloading video webpage', query={'key': 'AIzaSyDVQw45DwoYh632gvsP5vPDqEKvb-Ywnb8'},
headers={'Referer': 'https://drive.google.com/'})
formats = []
fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
fmt_list = (get_value('fmt_list') or '').split(',')
if fmt_stream_map and fmt_list:
resolutions = {}
for fmt in fmt_list:
mobj = re.search(
r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
if mobj:
resolutions[mobj.group('format_id')] = (
int(mobj.group('width')), int(mobj.group('height')))
for fmt in traverse_obj(video_info, (
'mediaStreamingData', 'formatStreamingData', ('adaptiveTranscodes', 'progressiveTranscodes'),
lambda _, v: url_or_none(v['url']))):
formats.append({
**traverse_obj(fmt, {
'url': 'url',
'format_id': ('itag', {int}, {str_or_none}),
}),
**traverse_obj(fmt, ('transcodeMetadata', {
'ext': ('mimeType', {mimetype2ext}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'fps': ('videoFps', {int_or_none}),
'filesize': ('contentLength', {int_or_none}),
'vcodec': ((('videoCodecString', {str}), {value('none')}), any),
'acodec': ((('audioCodecString', {str}), {value('none')}), any),
})),
'downloader_options': {
'http_chunk_size': 10 << 20,
},
})
for fmt_stream in fmt_stream_map:
fmt_stream_split = fmt_stream.split('|')
if len(fmt_stream_split) < 2:
continue
format_id, format_url = fmt_stream_split[:2]
ext = self._FORMATS_EXT.get(format_id)
if not ext:
self.report_warning(f'Unknown format {format_id}{bug_reports_message()}')
f = {
'url': lowercase_escape(format_url),
'format_id': format_id,
'ext': ext,
}
resolution = resolutions.get(format_id)
if resolution:
f.update({
'width': resolution[0],
'height': resolution[1],
})
formats.append(f)
title = traverse_obj(video_info, ('mediaMetadata', 'title', {str}))
source_url = update_url_query(
'https://drive.usercontent.google.com/download', {
@@ -264,30 +203,20 @@ class GoogleDriveIE(InfoExtractor):
or get_element_by_class('uc-error-caption', confirmation_webpage)
or 'unable to extract confirmation code')
if not formats and reason:
if title:
self.raise_no_formats(reason, expected=True)
else:
raise ExtractorError(reason, expected=True)
hl = get_value('hl')
subtitles_id = None
ttsurl = get_value('ttsurl')
if ttsurl:
# the subtitles ID is the vid param of the ttsurl query
subtitles_id = parse_qs(ttsurl).get('vid', [None])[-1]
self.cookiejar.clear(domain='.google.com', path='/', name='NID')
return {
'id': video_id,
'title': title,
'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
'duration': int_or_none(get_value('length_seconds')),
**traverse_obj(video_info, {
'duration': ('mediaMetadata', 'duration', {parse_duration}),
'thumbnails': ('thumbnails', lambda _, v: url_or_none(v['url']), {
'url': 'url',
'ext': ('mimeType', {mimetype2ext}),
'width': ('width', {int}),
'height': ('height', {int}),
}),
}),
'formats': formats,
'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
'automatic_captions': self.extract_automatic_captions(
video_id, subtitles_id, hl),
'subtitles': self.extract_subtitles(video_id, video_info),
}

View File

@@ -147,115 +147,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
)
_formats = { # NB: Used in YoutubeWebArchiveIE and GoogleDriveIE
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
# 3D videos
'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
# Apple HTTP Live Streaming
'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
# DASH mp4 video
'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
# Dash mp4 audio
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
# Dash webm
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
# Dash webm audio
'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
# Dash webm audio with opus inside
'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
# RTMP (unnamed)
'_rtmp': {'protocol': 'rtmp'},
# av01 video only formats sometimes served with "unknown" codecs
'394': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
'395': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
'396': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'av01.0.01M.08'},
'397': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'av01.0.04M.08'},
'398': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'av01.0.05M.08'},
'399': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'av01.0.08M.08'},
'400': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
'401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
}
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'srt', 'vtt')
_DEFAULT_CLIENTS = ('android_sdkless', 'tv', 'web_safari', 'web')
_DEFAULT_AUTHED_CLIENTS = ('tv', 'web_safari', 'web')