From d9e8130d94fce46fa2d554ec523e7b6c80e5f98c Mon Sep 17 00:00:00 2001 From: Sung Cho Date: Thu, 22 Sep 2022 23:05:31 -0700 Subject: [PATCH 1/9] caption_is_valid --- cdp_backend/utils/file_utils.py | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/cdp_backend/utils/file_utils.py b/cdp_backend/utils/file_utils.py index a23bd446..d8e61397 100644 --- a/cdp_backend/utils/file_utils.py +++ b/cdp_backend/utils/file_utils.py @@ -10,9 +10,11 @@ from typing import Optional, Tuple, Union from uuid import uuid4 +import ffmpeg import fireo import fsspec import requests +import webvtt from fsspec.core import url_to_fs from ..database import models as db_models @@ -626,3 +628,40 @@ def clip_and_reformat_video( log.error(ffmpeg_stderr) return output_path + + +def caption_is_valid(video_uri: str, caption_uri: str) -> bool: + """ + Validate the caption file at the URI provided. + + Parameters + ---------- + video_uri: str + The URI for the target video. + caption_uri: str + The URI of the caption file to validate. + + Returns + ------- + status: bool + The validation status. + + Notes + ----- + Duration of the video at video_uri + and the duration of the caption file are compared. + The caption file is accepted if the durations differ by no more than 20%. 
+ """ + try: + ffprobe = ffmpeg.probe(video_uri) + except ffmpeg.Error as e: + log.warning(f"ffprobe{video_uri}): {e.stderr}") + return False + + caption_length = webvtt.read(caption_uri).total_length + similar_audio_streams = filter( + lambda s: s.get("codec_type", "") == "audio" + and math.isclose(float(s.get("duration", "0.0")), caption_length, rel_tol=0.2), + ffprobe.get("streams", []), + ) + return any(similar_audio_streams) From af4ac43855e5269c1fc89ae27828e6e2e9589d02 Mon Sep 17 00:00:00 2001 From: Sung Cho Date: Thu, 22 Sep 2022 23:05:48 -0700 Subject: [PATCH 2/9] addition caption validation --- cdp_backend/pipeline/event_gather_pipeline.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cdp_backend/pipeline/event_gather_pipeline.py b/cdp_backend/pipeline/event_gather_pipeline.py index 12dca809..e8cdc49f 100644 --- a/cdp_backend/pipeline/event_gather_pipeline.py +++ b/cdp_backend/pipeline/event_gather_pipeline.py @@ -160,7 +160,11 @@ def create_event_gather_flow( if session.caption_uri is not None: # If the caption doesn't exist, remove the property # This will result in Speech-to-Text being used instead - if not resource_exists(session.caption_uri): + if not resource_exists( + session.caption_uri + ) or not file_utils.caption_is_valid( + tmp_video_filepath, session.caption_uri + ): log.warning( f"File not found using provided caption URI: " f"'{session.caption_uri}'. 
" From 6c8eb3b9f3f6d4773bd5295c3ee4e9b7e773fb5c Mon Sep 17 00:00:00 2001 From: Sung Cho Date: Thu, 22 Sep 2022 23:05:56 -0700 Subject: [PATCH 3/9] test --- cdp_backend/tests/utils/test_file_utils.py | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py index 010b310a..581e45ce 100644 --- a/cdp_backend/tests/utils/test_file_utils.py +++ b/cdp_backend/tests/utils/test_file_utils.py @@ -8,6 +8,7 @@ from typing import Optional from unittest import mock +import ffmpeg import imageio import pytest from py._path.local import LocalPath @@ -16,6 +17,7 @@ from cdp_backend.utils.file_utils import ( MAX_THUMBNAIL_HEIGHT, MAX_THUMBNAIL_WIDTH, + caption_is_valid, resource_copy, ) @@ -312,3 +314,26 @@ def test_clip_and_reformat_video( assert outfile.exists() assert outfile == expected_outfile os.remove(outfile) + + +@pytest.mark.parametrize( + "video_uri, caption_uri, end_time, expected", + [ + (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 120, False), + (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 60, True), + ], +) +def test_caption_is_valid( + resources_dir: Path, video_uri: str, caption_uri: str, end_time: int, expected: bool +) -> None: + temp_video = "caption-test.mp4" + ffmpeg.input(str(bytes(resources_dir / video_uri), encoding="utf-8")).output( + temp_video, codec="copy", t=end_time + ).run(overwrite_output=True) + + valid = caption_is_valid( + temp_video, + str(bytes(resources_dir / caption_uri), encoding="utf-8"), + ) + os.remove(temp_video) + assert valid == expected From 15ea0485b3acb096729fd7fcda49ffdbecc28302 Mon Sep 17 00:00:00 2001 From: Sung Cho Date: Sat, 24 Sep 2022 09:52:52 -0700 Subject: [PATCH 4/9] handle remote caption uri --- cdp_backend/tests/utils/test_file_utils.py | 17 ++++++++------- cdp_backend/utils/file_utils.py | 25 +++++++++++++++------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git 
a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py index 581e45ce..e92dc91a 100644 --- a/cdp_backend/tests/utils/test_file_utils.py +++ b/cdp_backend/tests/utils/test_file_utils.py @@ -317,23 +317,24 @@ def test_clip_and_reformat_video( @pytest.mark.parametrize( - "video_uri, caption_uri, end_time, expected", + "video_uri, caption_uri, end_time, is_resource, expected", [ - (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 120, False), - (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 60, True), + (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 120, True, False), + (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 60, True, True), + (EXAMPLE_VIDEO_FILENAME, "https://gist.github.com/dphoria/d3f35b5509b784ccd14b7efdc67df752/raw/c18fc459c62ff7530536ba19d08021682627c18a/sample.vtt", 27, False, True), ], ) def test_caption_is_valid( - resources_dir: Path, video_uri: str, caption_uri: str, end_time: int, expected: bool + resources_dir: Path, video_uri: str, caption_uri: str, end_time: int, is_resource: bool, expected: bool ) -> None: temp_video = "caption-test.mp4" ffmpeg.input(str(bytes(resources_dir / video_uri), encoding="utf-8")).output( temp_video, codec="copy", t=end_time ).run(overwrite_output=True) - valid = caption_is_valid( - temp_video, - str(bytes(resources_dir / caption_uri), encoding="utf-8"), - ) + if is_resource: + caption_uri = str(bytes(resources_dir / caption_uri), encoding="utf-8") + + valid = caption_is_valid(temp_video, caption_uri) os.remove(temp_video) assert valid == expected diff --git a/cdp_backend/utils/file_utils.py b/cdp_backend/utils/file_utils.py index d8e61397..b51f2861 100644 --- a/cdp_backend/utils/file_utils.py +++ b/cdp_backend/utils/file_utils.py @@ -3,10 +3,12 @@ import logging import math +import os import random import re from hashlib import sha256 from pathlib import Path +from tempfile import TemporaryDirectory from typing import Optional, Tuple, Union from uuid import uuid4 @@ -655,13 +657,20 
@@ def caption_is_valid(video_uri: str, caption_uri: str) -> bool: try: ffprobe = ffmpeg.probe(video_uri) except ffmpeg.Error as e: - log.warning(f"ffprobe{video_uri}): {e.stderr}") + log.warning(f"ffprobe({video_uri}): {e.stderr}") return False - caption_length = webvtt.read(caption_uri).total_length - similar_audio_streams = filter( - lambda s: s.get("codec_type", "") == "audio" - and math.isclose(float(s.get("duration", "0.0")), caption_length, rel_tol=0.2), - ffprobe.get("streams", []), - ) - return any(similar_audio_streams) + with TemporaryDirectory() as dir_path: + local_caption_path = resource_copy(caption_uri, dst=dir_path) + + try: + caption_length = webvtt.read(local_caption_path).total_length + + similar_audio_streams = filter( + lambda s: s.get("codec_type", "") == "audio" + and math.isclose(float(s.get("duration", "0.0")), caption_length, rel_tol=0.2), + ffprobe.get("streams", []), + ) + return any(similar_audio_streams) + finally: + os.remove(local_caption_path) From bcc94d6f36af1e7cc19096adf9be728d33e7e1ef Mon Sep 17 00:00:00 2001 From: Sung Cho Date: Sat, 24 Sep 2022 10:29:06 -0700 Subject: [PATCH 5/9] formatting --- cdp_backend/tests/utils/test_file_utils.py | 40 +++++++++++++++------- cdp_backend/utils/file_utils.py | 24 ++++++------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py index e92dc91a..ffe12c1e 100644 --- a/cdp_backend/tests/utils/test_file_utils.py +++ b/cdp_backend/tests/utils/test_file_utils.py @@ -5,6 +5,7 @@ import random import sys from pathlib import Path +from tempfile import TemporaryDirectory from typing import Optional from unittest import mock @@ -321,20 +322,33 @@ def test_clip_and_reformat_video( [ (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 120, True, False), (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 60, True, True), - (EXAMPLE_VIDEO_FILENAME, 
"https://gist.github.com/dphoria/d3f35b5509b784ccd14b7efdc67df752/raw/c18fc459c62ff7530536ba19d08021682627c18a/sample.vtt", 27, False, True), + ( + EXAMPLE_VIDEO_FILENAME, + "https://gist.github.com/dphoria/d3f35b5509b784ccd14b7efdc67df752/raw/" + "c18fc459c62ff7530536ba19d08021682627c18a/sample.vtt", + 27, + False, + True, + ), ], ) def test_caption_is_valid( - resources_dir: Path, video_uri: str, caption_uri: str, end_time: int, is_resource: bool, expected: bool + resources_dir: Path, + video_uri: str, + caption_uri: str, + end_time: int, + is_resource: bool, + expected: bool, ) -> None: - temp_video = "caption-test.mp4" - ffmpeg.input(str(bytes(resources_dir / video_uri), encoding="utf-8")).output( - temp_video, codec="copy", t=end_time - ).run(overwrite_output=True) - - if is_resource: - caption_uri = str(bytes(resources_dir / caption_uri), encoding="utf-8") - - valid = caption_is_valid(temp_video, caption_uri) - os.remove(temp_video) - assert valid == expected + with TemporaryDirectory() as dir_path: + temp_video = str( + bytes(Path(dir_path) / f"caption-test-{end_time}.mp4"), encoding="utf-8" + ) + ffmpeg.input(str(bytes(resources_dir / video_uri), encoding="utf-8")).output( + temp_video, codec="copy", t=end_time + ).run(overwrite_output=True) + + if is_resource: + caption_uri = str(bytes(resources_dir / caption_uri), encoding="utf-8") + + assert caption_is_valid(temp_video, caption_uri) == expected diff --git a/cdp_backend/utils/file_utils.py b/cdp_backend/utils/file_utils.py index b51f2861..4bb17f77 100644 --- a/cdp_backend/utils/file_utils.py +++ b/cdp_backend/utils/file_utils.py @@ -3,7 +3,6 @@ import logging import math -import os import random import re from hashlib import sha256 @@ -660,17 +659,16 @@ def caption_is_valid(video_uri: str, caption_uri: str) -> bool: log.warning(f"ffprobe({video_uri}): {e.stderr}") return False + # Making sure temp copy of the caption file is deleted when finished with TemporaryDirectory() as dir_path: local_caption_path = 
resource_copy(caption_uri, dst=dir_path) - - try: - caption_length = webvtt.read(local_caption_path).total_length - - similar_audio_streams = filter( - lambda s: s.get("codec_type", "") == "audio" - and math.isclose(float(s.get("duration", "0.0")), caption_length, rel_tol=0.2), - ffprobe.get("streams", []), - ) - return any(similar_audio_streams) - finally: - os.remove(local_caption_path) + caption_length = webvtt.read(local_caption_path).total_length + + similar_audio_streams = filter( + lambda s: s.get("codec_type", "") == "audio" + and math.isclose( + float(s.get("duration", "0.0")), caption_length, rel_tol=0.2 + ), + ffprobe.get("streams", []), + ) + return any(similar_audio_streams) From 4db6ad5029fd2b46103da3c29101ef9172ccc38f Mon Sep 17 00:00:00 2001 From: Sung Cho Date: Sat, 24 Sep 2022 10:30:54 -0700 Subject: [PATCH 6/9] formatting --- cdp_backend/tests/utils/test_file_utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py index ffe12c1e..bc04a7b4 100644 --- a/cdp_backend/tests/utils/test_file_utils.py +++ b/cdp_backend/tests/utils/test_file_utils.py @@ -340,15 +340,16 @@ def test_caption_is_valid( is_resource: bool, expected: bool, ) -> None: + def path_as_str(path: Path) -> str: + return str(bytes(path), encoding="utf-8") + with TemporaryDirectory() as dir_path: - temp_video = str( - bytes(Path(dir_path) / f"caption-test-{end_time}.mp4"), encoding="utf-8" - ) - ffmpeg.input(str(bytes(resources_dir / video_uri), encoding="utf-8")).output( + temp_video = path_as_str(Path(dir_path) / f"caption-test-{end_time}.mp4") + ffmpeg.input(path_as_str(resources_dir / video_uri)).output( temp_video, codec="copy", t=end_time ).run(overwrite_output=True) if is_resource: - caption_uri = str(bytes(resources_dir / caption_uri), encoding="utf-8") + caption_uri = path_as_str(resources_dir / caption_uri) assert caption_is_valid(temp_video, caption_uri) == 
expected From 557c45730d6c69258639ed270894733a339d1f71 Mon Sep 17 00:00:00 2001 From: Sung Cho Date: Sat, 24 Sep 2022 10:33:40 -0700 Subject: [PATCH 7/9] cleanup --- cdp_backend/tests/utils/test_file_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py index bc04a7b4..3e9cf222 100644 --- a/cdp_backend/tests/utils/test_file_utils.py +++ b/cdp_backend/tests/utils/test_file_utils.py @@ -320,13 +320,15 @@ def test_clip_and_reformat_video( @pytest.mark.parametrize( "video_uri, caption_uri, end_time, is_resource, expected", [ + # the video is about 3 minutes and boston_captions.vtt is about 1 minute (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 120, True, False), (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 60, True, True), ( EXAMPLE_VIDEO_FILENAME, + # about 30 seconds "https://gist.github.com/dphoria/d3f35b5509b784ccd14b7efdc67df752/raw/" "c18fc459c62ff7530536ba19d08021682627c18a/sample.vtt", - 27, + 30, False, True, ), From aaaca48a49d88646784aeb17cfd8870ab1354e3d Mon Sep 17 00:00:00 2001 From: Sung Cho Date: Sat, 24 Sep 2022 13:56:16 -0700 Subject: [PATCH 8/9] catch webvtt exceptions just in case --- cdp_backend/utils/file_utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cdp_backend/utils/file_utils.py b/cdp_backend/utils/file_utils.py index 4bb17f77..e9fa0ae5 100644 --- a/cdp_backend/utils/file_utils.py +++ b/cdp_backend/utils/file_utils.py @@ -17,6 +17,11 @@ import requests import webvtt from fsspec.core import url_to_fs +from webvtt.exceptions import ( + InvalidCaptionsError, + MalformedCaptionError, + MalformedFileError, +) from ..database import models as db_models @@ -662,7 +667,11 @@ def caption_is_valid(video_uri: str, caption_uri: str) -> bool: # Making sure temp copy of the caption file is deleted when finished with TemporaryDirectory() as dir_path: local_caption_path = resource_copy(caption_uri, 
dst=dir_path) - caption_length = webvtt.read(local_caption_path).total_length + try: + caption_length = webvtt.read(local_caption_path).total_length + except (InvalidCaptionsError, MalformedCaptionError, MalformedFileError) as e: + log.warning(f"webvtt.read({caption_uri}): {str(e)}") + return False similar_audio_streams = filter( lambda s: s.get("codec_type", "") == "audio" From b6d4622794f7a94186525b2a0acccb828eb0ad2b Mon Sep 17 00:00:00 2001 From: Sung Cho Date: Mon, 26 Sep 2022 17:19:08 -0700 Subject: [PATCH 9/9] str(Path) --- cdp_backend/tests/utils/test_file_utils.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py index 3e9cf222..807d14a7 100644 --- a/cdp_backend/tests/utils/test_file_utils.py +++ b/cdp_backend/tests/utils/test_file_utils.py @@ -342,16 +342,14 @@ def test_caption_is_valid( is_resource: bool, expected: bool, ) -> None: - def path_as_str(path: Path) -> str: - return str(bytes(path), encoding="utf-8") - with TemporaryDirectory() as dir_path: - temp_video = path_as_str(Path(dir_path) / f"caption-test-{end_time}.mp4") - ffmpeg.input(path_as_str(resources_dir / video_uri)).output( + temp_video = str(Path(dir_path) / f"caption-test-{end_time}.mp4") + ffmpeg.input(str(resources_dir / video_uri)).output( temp_video, codec="copy", t=end_time ).run(overwrite_output=True) if is_resource: - caption_uri = path_as_str(resources_dir / caption_uri) + caption_uri = str(resources_dir / caption_uri) + print(temp_video) assert caption_is_valid(temp_video, caption_uri) == expected