diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py index 2037492d2..a65abdf85 100644 --- a/tubesync/sync/models.py +++ b/tubesync/sync/models.py @@ -19,7 +19,7 @@ from .youtube import (get_media_info as get_youtube_media_info, download_media as download_youtube_media, get_channel_image_info as get_youtube_channel_image_info) -from .utils import seconds_to_timestr, parse_media_format +from .utils import seconds_to_timestr, parse_media_format, filter_response from .matching import (get_best_combined_format, get_best_audio_format, get_best_video_format) from .mediaservers import PlexMediaServer @@ -1143,8 +1143,39 @@ def format_dict(self): def has_metadata(self): return self.metadata is not None + + @property + def reduce_data(self): + try: + from common.logger import log + from common.utils import json_serial + + old_mdl = len(self.metadata or "") + data = json.loads(self.metadata or "") + compact_json = json.dumps(data, separators=(',', ':'), default=json_serial) + + filtered_data = filter_response(data, True) + filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial) + except Exception as e: + log.exception('reduce_data: %s', e) + else: + # log the results of filtering / compacting on metadata size + new_mdl = len(compact_json) + if old_mdl > new_mdl: + delta = old_mdl - new_mdl + log.info(f'{self.key}: metadata compacted by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})') + new_mdl = len(filtered_json) + if old_mdl > new_mdl: + delta = old_mdl - new_mdl + log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})') + if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False): + self.metadata = filtered_json + + @property def loaded_metadata(self): + if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False): + self.reduce_data try: data = json.loads(self.metadata) if not isinstance(data, dict): diff --git a/tubesync/sync/tasks.py b/tubesync/sync/tasks.py index b36b5d493..ab92e2c89 100644 --- a/tubesync/sync/tasks.py +++ b/tubesync/sync/tasks.py @@ -26,7 +26,7 @@ from common.utils import json_serial from .models import Source, Media, MediaServer from .utils import (get_remote_image, resize_image_to_height, delete_file, - write_text_file) + write_text_file, filter_response) from .filtering import filter_media @@ -304,7 +304,10 @@ def download_media_metadata(media_id): return source = media.source metadata = media.index_metadata() - media.metadata = json.dumps(metadata, default=json_serial) + response = metadata + if getattr(settings, 'SHRINK_NEW_MEDIA_METADATA', False): + response = filter_response(metadata, True) + media.metadata = json.dumps(response, separators=(',', ':'), default=json_serial) upload_date = media.upload_date # Media must have a valid upload date if upload_date: diff --git a/tubesync/sync/tests.py b/tubesync/sync/tests.py index 8f0de6efd..2704058fe 100644 --- a/tubesync/sync/tests.py +++ b/tubesync/sync/tests.py @@ -18,6 +18,7 @@ from .models import Source, Media from .tasks import cleanup_old_media from .filtering import filter_media +from .utils import filter_response class FrontEndTestCase(TestCase): @@ -1709,6 +1710,84 @@ def test_is_regex_match(self): f'expected {expected_match_result}') +class ResponseFilteringTestCase(TestCase): + + def setUp(self): + # Disable general logging for test case + logging.disable(logging.CRITICAL) + # Add a test source + self.source = Source.objects.create( + source_type=Source.SOURCE_TYPE_YOUTUBE_CHANNEL, + key='testkey', + name='testname', + directory='testdirectory', + index_schedule=3600, + delete_old_media=False, + days_to_keep=14, + source_resolution=Source.SOURCE_RESOLUTION_1080P, + source_vcodec=Source.SOURCE_VCODEC_VP9, + source_acodec=Source.SOURCE_ACODEC_OPUS, + prefer_60fps=False, + prefer_hdr=False, + fallback=Source.FALLBACK_FAIL + ) + # Add some media + self.media = Media.objects.create( + key='mediakey', + source=self.source, + metadata='{}' + ) + + def test_metadata_20230629(self): + self.media.metadata = all_test_metadata['20230629'] + self.media.save() + + unfiltered = self.media.loaded_metadata + filtered = filter_response(self.media.loaded_metadata) + self.assertIn('formats', unfiltered.keys()) + self.assertIn('formats', filtered.keys()) + # filtered 'downloader_options' + self.assertIn('downloader_options', unfiltered['formats'][10].keys()) + self.assertNotIn('downloader_options', filtered['formats'][10].keys()) + # filtered 'http_headers' + self.assertIn('http_headers', unfiltered['formats'][0].keys()) + self.assertNotIn('http_headers', filtered['formats'][0].keys()) + # did not lose any formats + self.assertEqual(48, len(unfiltered['formats'])) + self.assertEqual(48, len(filtered['formats'])) + self.assertEqual(len(unfiltered['formats']), len(filtered['formats'])) + # did not remove everything with url + self.assertIn('original_url', unfiltered.keys()) + self.assertIn('original_url', filtered.keys()) + self.assertEqual(unfiltered['original_url'], filtered['original_url']) + # did reduce the size of the metadata + self.assertTrue(len(str(filtered)) < len(str(unfiltered))) + + url_keys = [] + for format in unfiltered['formats']: + for key in format.keys(): + if 'url' in key: + url_keys.append((format['format_id'], key, format[key],)) + unfiltered_url_keys = url_keys + self.assertEqual(63, len(unfiltered_url_keys), msg=str(unfiltered_url_keys)) + + url_keys = [] + for format in filtered['formats']: + for key in format.keys(): + if 'url' in key: + url_keys.append((format['format_id'], key, format[key],)) + filtered_url_keys = url_keys + self.assertEqual(3, len(filtered_url_keys), msg=str(filtered_url_keys)) + + url_keys = [] + for lang_code, captions in filtered['automatic_captions'].items(): + for caption in captions: + for key in caption.keys(): + if 'url' in key: + url_keys.append((lang_code, caption['ext'], caption[key],)) + self.assertEqual(0, len(url_keys), msg=str(url_keys)) + + class TasksTestCase(TestCase): def setUp(self): diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py index 73c8f3947..1d67af38d 100644 --- a/tubesync/sync/utils.py +++ b/tubesync/sync/utils.py @@ -1,6 +1,7 @@ import os import re import math +from copy import deepcopy from operator import itemgetter from pathlib import Path from tempfile import NamedTemporaryFile @@ -171,6 +172,95 @@ def normalize_codec(codec_str): return result +def _url_keys(arg_dict, filter_func): + result = {} + for key in arg_dict.keys(): + if 'url' in key: + result.update( + {key: filter_func(key=key, url=arg_dict[key])} + ) + return result + + +def _drop_url_keys(arg_dict, key, filter_func): + if key in arg_dict.keys(): + for val_dict in arg_dict[key]: + for url_key, remove in _url_keys(val_dict, filter_func).items(): + if remove is True: + del val_dict[url_key] + + +def filter_response(arg_dict, copy_arg=False): + ''' + Clean up the response so as to not store useless metadata in the database. + ''' + response_dict = arg_dict + # raise an exception for an unexpected argument type + if not isinstance(response_dict, dict): + raise TypeError(f'response_dict must be a dict, got "{type(response_dict)}"') + + if copy_arg: + response_dict = deepcopy(arg_dict) + + # optimize the empty case + if not response_dict: + return response_dict + + # beginning of formats cleanup {{{ + # drop urls that expire, or restrict IPs + def drop_format_url(**kwargs): + url = kwargs['url'] + return ( + url + and '://' in url + and ( + '/ip/' in url + or 'ip=' in url + or '/expire/' in url + or 'expire=' in url + ) + ) + + # these format keys are not useful to us + drop_keys = frozenset(( + 'downloader_options', + 'fragments', + 'http_headers', + '__needs_testing', + '__working', + )) + for key in frozenset(('formats', 'requested_formats',)): + _drop_url_keys(response_dict, key, drop_format_url) + if key in response_dict.keys(): + for format in response_dict[key]: + for drop_key in drop_keys: + if drop_key in format.keys(): + del format[drop_key] + # end of formats cleanup }}} + + # beginning of subtitles cleanup {{{ + # drop urls that expire + def drop_subtitles_url(**kwargs): + url = kwargs['url'] + return ( + url + and '://' in url + and ( + '/expire/' in url + or '&expire=' in url + ) + ) + + for key in frozenset(('subtitles', 'automatic_captions',)): + if key in response_dict.keys(): + key_dict = response_dict[key] + for lang_code in key_dict: + _drop_url_keys(key_dict, lang_code, drop_subtitles_url) + # end of subtitles cleanup }}} + + return response_dict + + def parse_media_format(format_dict): ''' This parser primarily adapts the format dict returned by youtube-dl into a