Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/filter-out-invalid-captions #214

Merged
merged 9 commits into from Sep 27, 2022
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 5 additions & 1 deletion cdp_backend/pipeline/event_gather_pipeline.py
Expand Up @@ -160,7 +160,11 @@ def create_event_gather_flow(
if session.caption_uri is not None:
# If the caption doesn't exist, remove the property
# This will result in Speech-to-Text being used instead
if not resource_exists(session.caption_uri):
if not resource_exists(
session.caption_uri
) or not file_utils.caption_is_valid(
dphoria marked this conversation as resolved.
Show resolved Hide resolved
tmp_video_filepath, session.caption_uri
):
log.warning(
f"File not found using provided caption URI: "
f"'{session.caption_uri}'. "
Expand Down
43 changes: 43 additions & 0 deletions cdp_backend/tests/utils/test_file_utils.py
Expand Up @@ -5,9 +5,11 @@
import random
import sys
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional
from unittest import mock

import ffmpeg
import imageio
import pytest
from py._path.local import LocalPath
Expand All @@ -16,6 +18,7 @@
from cdp_backend.utils.file_utils import (
MAX_THUMBNAIL_HEIGHT,
MAX_THUMBNAIL_WIDTH,
caption_is_valid,
resource_copy,
)

Expand Down Expand Up @@ -312,3 +315,43 @@ def test_clip_and_reformat_video(
assert outfile.exists()
assert outfile == expected_outfile
os.remove(outfile)


@pytest.mark.parametrize(
    "video_uri, caption_uri, end_time, is_resource, expected",
    [
        # the video is about 3 minutes and boston_captions.vtt is about 1 minute
        (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 120, True, False),
        (EXAMPLE_VIDEO_FILENAME, "boston_captions.vtt", 60, True, True),
        (
            EXAMPLE_VIDEO_FILENAME,
            # about 30 seconds
            "https://gist.github.com/dphoria/d3f35b5509b784ccd14b7efdc67df752/raw/"
            "c18fc459c62ff7530536ba19d08021682627c18a/sample.vtt",
            30,
            False,
            True,
        ),
    ],
)
def test_caption_is_valid(
    resources_dir: Path,
    video_uri: str,
    caption_uri: str,
    end_time: int,
    is_resource: bool,
    expected: bool,
) -> None:
    """
    Check caption_is_valid against a video trimmed to a known length.

    The example video is cut to ``end_time`` seconds into a temporary
    directory, then the caption file is validated against that clip.
    When ``is_resource`` is True, ``caption_uri`` is a file name inside
    ``resources_dir``; otherwise it is passed to caption_is_valid unchanged.
    """
    with TemporaryDirectory() as dir_path:
        # The ffmpeg calls and caption_is_valid are given plain string paths.
        temp_video = str(Path(dir_path) / f"caption-test-{end_time}.mp4")

        # Stream-copy the first ``end_time`` seconds of the source video.
        ffmpeg.input(str(resources_dir / video_uri)).output(
            temp_video, codec="copy", t=end_time
        ).run(overwrite_output=True)

        if is_resource:
            caption_uri = str(resources_dir / caption_uri)

        # Must run inside the ``with`` block: the clip is deleted on exit.
        assert caption_is_valid(temp_video, caption_uri) == expected
55 changes: 55 additions & 0 deletions cdp_backend/utils/file_utils.py
Expand Up @@ -7,13 +7,21 @@
import re
from hashlib import sha256
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional, Tuple, Union
from uuid import uuid4

import ffmpeg
import fireo
import fsspec
import requests
import webvtt
from fsspec.core import url_to_fs
from webvtt.exceptions import (
InvalidCaptionsError,
MalformedCaptionError,
MalformedFileError,
)

from ..database import models as db_models

Expand Down Expand Up @@ -626,3 +634,50 @@ def clip_and_reformat_video(
log.error(ffmpeg_stderr)

return output_path


def caption_is_valid(video_uri: str, caption_uri: str) -> bool:
    """
    Validate the caption file at the URI provided.

    Parameters
    ----------
    video_uri: str
        The URI for the target video.
    caption_uri: str
        The URI of the caption file to validate.

    Returns
    -------
    status: bool
        The validation status.
        False when the video cannot be probed, when the caption file cannot
        be parsed, or when no audio stream has a duration similar to the
        caption duration.

    Notes
    -----
    The duration of each audio stream of the video at video_uri
    is compared with the duration of the caption file.
    The caption file is accepted if, for at least one audio stream,
    the durations differ by no more than 20% of the larger of the two
    (``math.isclose`` with ``rel_tol=0.2``).
    Audio streams without a positive, numeric duration are ignored.
    """
    try:
        ffprobe = ffmpeg.probe(video_uri)
    except ffmpeg.Error as e:
        log.warning(f"ffprobe({video_uri}): {e.stderr}")
        return False

    # Making sure temp copy of the caption file is deleted when finished
    with TemporaryDirectory() as dir_path:
        local_caption_path = resource_copy(caption_uri, dst=dir_path)
        try:
            caption_length = webvtt.read(local_caption_path).total_length
        except (InvalidCaptionsError, MalformedCaptionError, MalformedFileError) as e:
            log.warning(f"webvtt.read({caption_uri}): {str(e)}")
            return False

    for stream in ffprobe.get("streams", []):
        if stream.get("codec_type", "") != "audio":
            continue

        # A missing or non-numeric duration gives us nothing to compare to;
        # skip the stream instead of raising out of the validation.
        try:
            duration = float(stream.get("duration", "0.0"))
        except (TypeError, ValueError):
            continue

        # Require a positive duration so that an unknown (0.0) stream duration
        # can never "match" a zero-length caption file.
        if duration > 0 and math.isclose(duration, caption_length, rel_tol=0.2):
            return True

    return False