Skip to content

Commit

Permalink
feature/filter-out-invalid-captions (#214)
Browse files Browse the repository at this point in the history
* caption_is_valid

* addition caption validation

* test

* handle remote caption uri

* formatting

* formatting

* cleanup

* catch webvtt exceptions just in case

* str(Path)
  • Loading branch information
dphoria committed Sep 27, 2022
1 parent ab3a65d commit 1f66f12
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 1 deletion.
6 changes: 5 additions & 1 deletion cdp_backend/pipeline/event_gather_pipeline.py
Expand Up @@ -160,7 +160,11 @@ def create_event_gather_flow(
if session.caption_uri is not None:
# If the caption doesn't exist or fails validation against the video,
# remove the property. This will result in Speech-to-Text being used instead
if not resource_exists(session.caption_uri):
if not resource_exists(
session.caption_uri
) or not file_utils.caption_is_valid(
tmp_video_filepath, session.caption_uri
):
log.warning(
f"File not found using provided caption URI: "
f"'{session.caption_uri}'. "
Expand Down
41 changes: 41 additions & 0 deletions cdp_backend/tests/utils/test_file_utils.py
Expand Up @@ -5,9 +5,11 @@
import random
import sys
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional
from unittest import mock

import ffmpeg
import imageio
import pytest
from py._path.local import LocalPath
Expand All @@ -16,6 +18,7 @@
from cdp_backend.utils.file_utils import (
MAX_THUMBNAIL_HEIGHT,
MAX_THUMBNAIL_WIDTH,
caption_is_valid,
resource_copy,
)

Expand Down Expand Up @@ -312,3 +315,41 @@ def test_clip_and_reformat_video(
assert outfile.exists()
assert outfile == expected_outfile
os.remove(outfile)


@pytest.mark.parametrize(
    "video_uri, caption_uri, end_time, is_resource, expected",
    [
        # The example video is about 3 minutes and boston_captions.vtt is
        # about 1 minute, so a 120 second clip should be rejected while a
        # 60 second clip should be accepted.
        (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 120, True, False),
        (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 60, True, True),
        (
            EXAMPLE_VIDEO_FILENAME,
            # Remote caption file, about 30 seconds
            "https://gist.github.com/dphoria/d3f35b5509b784ccd14b7efdc67df752/raw/"
            "c18fc459c62ff7530536ba19d08021682627c18a/sample.vtt",
            30,
            False,
            True,
        ),
    ],
)
def test_caption_is_valid(
    resources_dir: Path,
    video_uri: str,
    caption_uri: str,
    end_time: int,
    is_resource: bool,
    expected: bool,
) -> None:
    """
    Check caption_is_valid against clips of the example video.

    The example video is trimmed to ``end_time`` seconds with ffmpeg and the
    resulting clip is validated against the caption file. When
    ``is_resource`` is True the caption name is resolved relative to
    ``resources_dir``; otherwise ``caption_uri`` is passed through unchanged.
    """
    with TemporaryDirectory() as dir_path:
        # Stream-copy the first end_time seconds into a throwaway file
        temp_video = str(Path(dir_path) / f"caption-test-{end_time}.mp4")
        ffmpeg.input(str(resources_dir / video_uri)).output(
            temp_video, codec="copy", t=end_time
        ).run(overwrite_output=True)

        if is_resource:
            caption_uri = str(resources_dir / caption_uri)

        # Must run while the temporary directory (and the clip) still exist
        assert caption_is_valid(temp_video, caption_uri) == expected
55 changes: 55 additions & 0 deletions cdp_backend/utils/file_utils.py
Expand Up @@ -7,13 +7,21 @@
import re
from hashlib import sha256
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional, Tuple, Union
from uuid import uuid4

import ffmpeg
import fireo
import fsspec
import requests
import webvtt
from fsspec.core import url_to_fs
from webvtt.exceptions import (
InvalidCaptionsError,
MalformedCaptionError,
MalformedFileError,
)

from ..database import models as db_models

Expand Down Expand Up @@ -626,3 +634,50 @@ def clip_and_reformat_video(
log.error(ffmpeg_stderr)

return output_path


def caption_is_valid(video_uri: str, caption_uri: str) -> bool:
    """
    Validate the caption file at the URI provided.

    Parameters
    ----------
    video_uri: str
        The URI for the target video.
    caption_uri: str
        The URI of the caption file to validate.

    Returns
    -------
    status: bool
        The validation status.

    Notes
    -----
    The duration of each audio stream reported by ffprobe for the video
    is compared with the total length of the caption file.
    The caption file is accepted if at least one audio stream has a duration
    that differs from the caption length by no more than 20%.
    Audio streams without a positive, parseable duration are ignored, so a
    zero-length caption file never matches a stream that is merely missing
    its duration.
    False is also returned when ffprobe fails on the video or when the
    caption file cannot be parsed as WebVTT.
    """
    try:
        ffprobe = ffmpeg.probe(video_uri)
    except ffmpeg.Error as e:
        log.warning(f"ffprobe({video_uri}): {e.stderr}")
        return False

    # Making sure temp copy of the caption file is deleted when finished
    with TemporaryDirectory() as dir_path:
        local_caption_path = resource_copy(caption_uri, dst=dir_path)
        try:
            caption_length = webvtt.read(local_caption_path).total_length
        except (InvalidCaptionsError, MalformedCaptionError, MalformedFileError) as e:
            log.warning(f"webvtt.read({caption_uri}): {str(e)}")
            return False

    for stream in ffprobe.get("streams", []):
        if stream.get("codec_type", "") != "audio":
            continue

        try:
            duration = float(stream.get("duration", "0.0"))
        except (TypeError, ValueError):
            # Duration present but not numeric; nothing to compare against
            continue

        # A missing duration defaults to 0.0, which would otherwise compare
        # as "close" to an empty caption file; require a real duration.
        if duration > 0.0 and math.isclose(duration, caption_length, rel_tol=0.2):
            return True

    return False

0 comments on commit 1f66f12

Please sign in to comment.