Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/filter-out-invalid-captions #214

Merged
merged 9 commits into from Sep 27, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 5 additions & 1 deletion cdp_backend/pipeline/event_gather_pipeline.py
Expand Up @@ -160,7 +160,11 @@ def create_event_gather_flow(
if session.caption_uri is not None:
# If the caption doesn't exist, remove the property
# This will result in Speech-to-Text being used instead
if not resource_exists(session.caption_uri):
if not resource_exists(
session.caption_uri
) or not file_utils.caption_is_valid(
dphoria marked this conversation as resolved.
Show resolved Hide resolved
tmp_video_filepath, session.caption_uri
):
log.warning(
f"File not found using provided caption URI: "
f"'{session.caption_uri}'. "
Expand Down
25 changes: 25 additions & 0 deletions cdp_backend/tests/utils/test_file_utils.py
Expand Up @@ -8,6 +8,7 @@
from typing import Optional
from unittest import mock

import ffmpeg
import imageio
import pytest
from py._path.local import LocalPath
Expand All @@ -16,6 +17,7 @@
from cdp_backend.utils.file_utils import (
MAX_THUMBNAIL_HEIGHT,
MAX_THUMBNAIL_WIDTH,
caption_is_valid,
resource_copy,
)

Expand Down Expand Up @@ -312,3 +314,26 @@ def test_clip_and_reformat_video(
assert outfile.exists()
assert outfile == expected_outfile
os.remove(outfile)


@pytest.mark.parametrize(
    "video_uri, caption_uri, end_time, expected",
    [
        (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 120, False),
        (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 60, True),
    ],
)
def test_caption_is_valid(
    resources_dir: Path, video_uri: str, caption_uri: str, end_time: int, expected: bool
) -> None:
    """
    Check caption_is_valid against a clip of the example video.

    The example video is stream-copied to a temporary file limited to
    ``end_time`` seconds (ffmpeg ``t`` option), and the caption file is then
    validated against that clip.
    """
    temp_video = "caption-test.mp4"
    try:
        # Produce a clip of the requested length without re-encoding.
        ffmpeg.input(str(resources_dir / video_uri)).output(
            temp_video, codec="copy", t=end_time
        ).run(overwrite_output=True)

        valid = caption_is_valid(
            temp_video,
            str(resources_dir / caption_uri),
        )
    finally:
        # Always clean up the clip, even when ffmpeg or the validation raises,
        # so a failing run does not leave the file in the working directory.
        if os.path.exists(temp_video):
            os.remove(temp_video)

    assert valid == expected
39 changes: 39 additions & 0 deletions cdp_backend/utils/file_utils.py
Expand Up @@ -10,9 +10,11 @@
from typing import Optional, Tuple, Union
from uuid import uuid4

import ffmpeg
import fireo
import fsspec
import requests
import webvtt
from fsspec.core import url_to_fs

from ..database import models as db_models
Expand Down Expand Up @@ -626,3 +628,40 @@ def clip_and_reformat_video(
log.error(ffmpeg_stderr)

return output_path


def caption_is_valid(video_uri: str, caption_uri: str) -> bool:
    """
    Validate the caption file at the URI provided.

    Parameters
    ----------
    video_uri: str
        The URI for the target video.
    caption_uri: str
        The URI of the caption file to validate.

    Returns
    -------
    status: bool
        True when at least one audio stream of the video has a duration
        close to the caption length. False otherwise, including when
        ffprobe fails on the video.

    Notes
    -----
    The duration of each audio stream reported by ffprobe for the video
    is compared with the total length of the caption file.
    The caption file is accepted if, for any audio stream, the two values
    are within a relative tolerance of 20% (``math.isclose``, ``rel_tol=0.2``).
    An audio stream that reports no duration is treated as 0.0 seconds.
    Errors raised while reading the caption file are not handled here and
    propagate to the caller.
    """
    try:
        ffprobe = ffmpeg.probe(video_uri)
    except ffmpeg.Error as e:
        # A video that cannot be probed gives nothing to compare against.
        log.warning(f"ffprobe({video_uri}): {e.stderr}")
        return False

    caption_length = webvtt.read(caption_uri).total_length

    # Only audio streams are considered; one close-enough match is sufficient.
    return any(
        s.get("codec_type", "") == "audio"
        and math.isclose(float(s.get("duration", "0.0")), caption_length, rel_tol=0.2)
        for s in ffprobe.get("streams", [])
    )