From 1f66f126652aa5f6f2555a9b1526a61867b045ef Mon Sep 17 00:00:00 2001 From: Sung Cho <60983962+dphoria@users.noreply.github.com> Date: Mon, 26 Sep 2022 17:22:23 -0700 Subject: [PATCH] feature/filter-out-invalid-captions (#214) * caption_is_valid * addition caption validation * test * handle remote caption uri * formatting * formatting * cleanup * catch webvtt exceptions just in case * str(Path) --- cdp_backend/pipeline/event_gather_pipeline.py | 6 +- cdp_backend/tests/utils/test_file_utils.py | 41 ++++++++++++++ cdp_backend/utils/file_utils.py | 55 +++++++++++++++++++ 3 files changed, 101 insertions(+), 1 deletion(-) diff --git a/cdp_backend/pipeline/event_gather_pipeline.py b/cdp_backend/pipeline/event_gather_pipeline.py index 12dca809..e8cdc49f 100644 --- a/cdp_backend/pipeline/event_gather_pipeline.py +++ b/cdp_backend/pipeline/event_gather_pipeline.py @@ -160,7 +160,11 @@ def create_event_gather_flow( if session.caption_uri is not None: # If the caption doesn't exist, remove the property # This will result in Speech-to-Text being used instead - if not resource_exists(session.caption_uri): + if not resource_exists( + session.caption_uri + ) or not file_utils.caption_is_valid( + tmp_video_filepath, session.caption_uri + ): log.warning( f"File not found using provided caption URI: " f"'{session.caption_uri}'. 
" diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py index 010b310a..807d14a7 100644 --- a/cdp_backend/tests/utils/test_file_utils.py +++ b/cdp_backend/tests/utils/test_file_utils.py @@ -5,9 +5,11 @@ import random import sys from pathlib import Path +from tempfile import TemporaryDirectory from typing import Optional from unittest import mock +import ffmpeg import imageio import pytest from py._path.local import LocalPath @@ -16,6 +18,7 @@ from cdp_backend.utils.file_utils import ( MAX_THUMBNAIL_HEIGHT, MAX_THUMBNAIL_WIDTH, + caption_is_valid, resource_copy, ) @@ -312,3 +315,41 @@ def test_clip_and_reformat_video( assert outfile.exists() assert outfile == expected_outfile os.remove(outfile) + + +@pytest.mark.parametrize( + "video_uri, caption_uri, end_time, is_resource, expected", + [ + # the video is about 3 minutes and boston_captions.vtt is about 1 minute + (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 120, True, False), + (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 60, True, True), + ( + EXAMPLE_VIDEO_FILENAME, + # about 30 seconds + "https://gist.github.com/dphoria/d3f35b5509b784ccd14b7efdc67df752/raw/" + "c18fc459c62ff7530536ba19d08021682627c18a/sample.vtt", + 30, + False, + True, + ), + ], +) +def test_caption_is_valid( + resources_dir: Path, + video_uri: str, + caption_uri: str, + end_time: int, + is_resource: bool, + expected: bool, +) -> None: + with TemporaryDirectory() as dir_path: + temp_video = str(Path(dir_path) / f"caption-test-{end_time}.mp4") + ffmpeg.input(str(resources_dir / video_uri)).output( + temp_video, codec="copy", t=end_time + ).run(overwrite_output=True) + + if is_resource: + caption_uri = str(resources_dir / caption_uri) + + print(temp_video) + assert caption_is_valid(temp_video, caption_uri) == expected diff --git a/cdp_backend/utils/file_utils.py b/cdp_backend/utils/file_utils.py index a23bd446..e9fa0ae5 100644 --- a/cdp_backend/utils/file_utils.py +++ 
def caption_is_valid(video_uri: str, caption_uri: str) -> bool:
    """
    Validate the caption file at the URI provided.

    Parameters
    ----------
    video_uri: str
        The URI for the target video.
    caption_uri: str
        The URI of the caption file to validate.

    Returns
    -------
    status: bool
        The validation status.

    Notes
    -----
    Duration of the video at video_uri
    and the duration of the caption file are compared.
    The caption file is accepted if the duration of at least one audio stream
    and the caption length differ by no more than 20%.

    The caption file is rejected when the video cannot be probed,
    when the caption file cannot be parsed as WebVTT,
    or when the parsed caption length is not positive.
    Audio streams without a numeric duration are ignored.
    """
    try:
        ffprobe = ffmpeg.probe(video_uri)
    except ffmpeg.Error as e:
        log.warning(f"ffprobe({video_uri}): {e.stderr}")
        return False

    # Making sure temp copy of the caption file is deleted when finished
    with TemporaryDirectory() as dir_path:
        local_caption_path = resource_copy(caption_uri, dst=dir_path)
        try:
            caption_length = webvtt.read(local_caption_path).total_length
        except (InvalidCaptionsError, MalformedCaptionError, MalformedFileError) as e:
            log.warning(f"webvtt.read({caption_uri}): {e}")
            return False

    # A zero-length caption would otherwise compare as "close" to an audio
    # stream whose duration is missing (and therefore defaults to 0.0).
    if caption_length <= 0:
        log.warning(f"webvtt.read({caption_uri}): caption length is {caption_length}")
        return False

    for stream in ffprobe.get("streams", []):
        if stream.get("codec_type", "") != "audio":
            continue

        try:
            duration = float(stream.get("duration", "0.0"))
        except (TypeError, ValueError):
            # Duration is present but not numeric; this stream cannot be compared.
            continue

        # rel_tol is relative to the larger of the two values, so a stream with
        # a missing (0.0) duration can never match a positive caption length.
        if math.isclose(duration, caption_length, rel_tol=0.2):
            return True

    return False