Skip to content

Commit

Permalink
feature/add-video-trimming-via-transcription-ranges-to-session (#221)
Browse files Browse the repository at this point in the history
* Add transcription range fields to database and ingestion models, add validator for time duration, add range filter to ffmpeg audio split, update tests

* Add tests for edge case transcription ranges

* Fix end_time=start_time typo, update tests to be more concise, add ffmpeg error logging

* Updated transcription range to video range, updated video handling to host when limiting video to a range, updated mp4 conversion to allow a range, connected mp4 to clip functionality, updated tests and tried to make testing slightly more consistent, added Session ingestion verification to test out

* Update session hash to reflect trimmed video

* Bypass hash task

* Remove unnecessary logging, duration validation comments, elif typo fix in cdp_will_host control structure

* Reverted function parameter doc for split audio

* Improved documentation for video_start_time in ingestion_models
  • Loading branch information
chrisjkhan committed Dec 6, 2022
1 parent 2d080b4 commit b00f378
Show file tree
Hide file tree
Showing 9 changed files with 240 additions and 60 deletions.
4 changes: 4 additions & 0 deletions cdp_backend/database/models.py
Expand Up @@ -494,6 +494,8 @@ class Session(Model):
session_index = fields.NumberField(required=True)
session_content_hash = fields.TextField(required=True)
video_uri = fields.TextField(required=True, validator=validators.resource_exists)
# Optional trim range for the session video, stored as FFmpeg-style duration
# strings. The validator is passed by keyword, matching the sibling fields
# (video_uri, caption_uri).
# NOTE(review): FireO appears to read field options from keyword arguments
# only, so a positional validator would not be registered -- confirm.
video_start_time = fields.TextField(validator=validators.time_duration_is_valid)
video_end_time = fields.TextField(validator=validators.time_duration_is_valid)
caption_uri = fields.TextField(validator=validators.resource_exists)
external_source_id = fields.TextField()

Expand All @@ -515,6 +517,8 @@ def Example(cls) -> Model:
session.video_uri = (
"https://video.seattle.gov/media/council/brief_072219_2011957V.mp4"
)
session.video_start_time = "01:00:00"
session.video_end_time = "99:59:59"
session.session_content_hash = (
"05bd857af7f70bf51b6aac1144046973bf3325c9101a554bc27dc9607dbbd8f5"
)
Expand Down
28 changes: 28 additions & 0 deletions cdp_backend/database/validators.py
Expand Up @@ -72,6 +72,34 @@ def router_string_is_valid(router_string: Optional[str]) -> bool:
return False


def time_duration_is_valid(time_duration: Optional[str]) -> bool:
    """
    Validate that the provided time duration string is acceptable to FFmpeg.

    Only the ``[[H]H:][M]M:SS`` / ``[S]S`` subset of the FFmpeg duration
    syntax is accepted, i.e. one of ``S``, ``SS``, ``M:SS``, ``MM:SS``,
    ``H:MM:SS`` or ``HH:MM:SS`` where minutes and seconds are in 00-59.
    The FFmpeg spec itself is more flexible; this validator is deliberately
    stricter.

    None is a valid option.

    Parameters
    ----------
    time_duration: Optional[str]
        The time duration to validate.

    Returns
    -------
    status: bool
        The validation status.
    """
    if time_duration is None:
        return True

    # fullmatch (rather than match + "$") rejects a trailing newline, and
    # re.ASCII restricts \d to 0-9 so non-ASCII digits are rejected.
    # Innermost to outermost optional groups: hours, minutes, tens-of-seconds.
    matched = re.fullmatch(
        r"((((\d{1,2}:)?[0-5])?\d:)?[0-5])?\d",
        time_duration,
        flags=re.ASCII,
    )
    return matched is not None


def email_is_valid(email: Optional[str]) -> bool:
"""
Validate that a valid email was provided.
Expand Down
45 changes: 32 additions & 13 deletions cdp_backend/pipeline/event_gather_pipeline.py
Expand Up @@ -130,18 +130,13 @@ def create_event_gather_flow(
# Download video to local copy
resource_copy_filepath = resource_copy_task(uri=session.video_uri)

# Get unique session identifier
session_content_hash = get_session_content_hash(
tmp_video_filepath=resource_copy_filepath,
)

# Handle video conversion or non-secure resource
# hosting
(
tmp_video_filepath,
session_video_hosted_url,
session_content_hash,
) = convert_video_and_handle_host(
session_content_hash=session_content_hash,
video_filepath=resource_copy_filepath,
session=session,
credentials_file=config.google_credentials_file,
Expand Down Expand Up @@ -293,14 +288,13 @@ def get_session_content_hash(
return file_utils.hash_file_contents(uri=tmp_video_filepath)


@task(nout=2)
@task(nout=3)
def convert_video_and_handle_host(
session_content_hash: str,
video_filepath: str,
session: Session,
credentials_file: str,
bucket: str,
) -> Tuple[str, str]:
) -> Tuple[str, str, str]:
"""
Convert a video to MP4 (if necessary), upload it to the file store, and remove
the original non-MP4 file that was resource copied.
Expand Down Expand Up @@ -330,19 +324,41 @@ def convert_video_and_handle_host(
# Get file extension
ext = Path(video_filepath).suffix.lower()

trim_video = bool(session.video_start_time or session.video_end_time)

# Convert to mp4 if file isn't of approved web format
cdp_will_host = False
if ext not in [".mp4", ".webm"]:
cdp_will_host = True

# Convert video to mp4
mp4_filepath = file_utils.convert_video_to_mp4(video_filepath)
mp4_filepath = file_utils.convert_video_to_mp4(
video_filepath=Path(video_filepath),
start_time=session.video_start_time,
end_time=session.video_end_time,
)

# Remove old mkv file
fs_functions.remove_local_file(video_filepath)

# Update variable name for easier downstream typing
video_filepath = mp4_filepath
video_filepath = str(mp4_filepath)

# host trimmed videos because it's simpler than setting
# up transcription and playback ranges
elif trim_video:
cdp_will_host = True

# Trim video
trimmed_filepath = file_utils.clip_and_reformat_video(
video_filepath=Path(video_filepath),
start_time=session.video_start_time,
end_time=session.video_end_time,
)

fs_functions.remove_local_file(video_filepath)

# Update variable name for easier downstream typing
video_filepath = str(trimmed_filepath)

# Check if original session video uri is a m3u8
# We can't follow the normal convert video process from above
Expand Down Expand Up @@ -370,6 +386,9 @@ def convert_video_and_handle_host(
else:
hosted_video_media_url = session.video_uri

# Get unique session identifier
session_content_hash = file_utils.hash_file_contents(uri=video_filepath)

# Upload and swap if cdp is hosting
if cdp_will_host:
# Upload to gcsfs
Expand All @@ -387,7 +406,7 @@ def convert_video_and_handle_host(
uri=hosted_video_uri,
)

return video_filepath, hosted_video_media_url
return video_filepath, hosted_video_media_url, session_content_hash


@task
Expand Down
27 changes: 27 additions & 0 deletions cdp_backend/pipeline/ingestion_models.py
Expand Up @@ -134,14 +134,39 @@ class Session(IngestionModel, DataClassJsonMixin):
"""
A session is a working period for an event.
For example, an event could have a morning and afternoon session.
Notes
-----
video_start_time is a duration relative to the beginning of the video in
HH:MM:SS format. It does not affect nor is relative to session_datetime
or any other datetime. If the portion of the video relevant to the session
begins 37m50s into the full video, video_start_time will be "37:50".
An absent start time is equivalent to the beginning of the video, and an
absent end time is equivalent to the end of the video, so either can be omitted.
"""

session_datetime: datetime
video_uri: str
session_index: int
video_start_time: Optional[str] = None
video_end_time: Optional[str] = None
caption_uri: Optional[str] = None
external_source_id: Optional[str] = None

def __post_init__(self) -> None:
    """
    Validate the video start/end time pair during ingestion.

    The check only runs when both ``video_start_time`` and
    ``video_end_time`` are set (non-empty); a missing bound means
    "beginning of video" / "end of video" and needs no comparison.

    Raises
    ------
    ValueError
        If either duration is malformed (more than three ``:``-separated
        fields, or a field that is not made of ASCII digits), or if the
        start time is not strictly before the end time.
    """
    # Nothing to compare unless both bounds were provided.
    if not (self.video_start_time and self.video_end_time):
        return

    def _to_seconds(duration: str) -> int:
        # Convert a flexible "[[h]h:][[m]m:][s]s" duration to total seconds.
        parts = duration.split(":")
        if len(parts) > 3:
            # Previously extra leading fields were silently ignored.
            raise ValueError(
                f"time duration has more than three ':'-separated fields: "
                f"{duration!r}"
            )
        if not all(part.isascii() and part.isdigit() for part in parts):
            raise ValueError(
                f"time duration fields must be non-empty digits: {duration!r}"
            )

        # Least-significant first, padded so minutes/hours default to zero.
        values = [int(part) for part in reversed(parts)]
        values += [0] * (3 - len(values))
        seconds, minutes, hours = values
        return seconds + minutes * 60 + hours * 3600

    start_seconds = _to_seconds(self.video_start_time)
    end_seconds = _to_seconds(self.video_end_time)
    if start_seconds >= end_seconds:
        raise ValueError("start_time must be less than end_time if both exist")


@dataclass
class Body(IngestionModel, DataClassJsonMixin):
Expand Down Expand Up @@ -263,6 +288,8 @@ class EventIngestionModel(IngestionModel, DataClassJsonMixin):
video_uri=(
"https://video.seattle.gov/media/council/council_113020_2022091V.mp4"
),
video_start_time=("00:00:00"),
video_end_time=("99:59:59"),
caption_uri=(
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2020/council_113020_2022091.vtt" # noqa: E501
),
Expand Down
17 changes: 16 additions & 1 deletion cdp_backend/pipeline/mock_get_events.py
Expand Up @@ -54,20 +54,33 @@
(
"https://video.seattle.gov/media/council/council_010421_2022101V.mp4",
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2021/council_010421_2022101.vtt", # noqa
None,
None,
),
(
"https://video.seattle.gov/media/council/council_113020_2022091V.mp4",
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2020/council_113020_2022091.vtt", # noqa
"1",
"25:25",
),
(
"https://video.seattle.gov/media/council/council_112320_2022089V.mp4",
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2020/brief_112320_2012089.vtt", # noqa
None,
"2:58:14",
),
(
"https://video.seattle.gov/media/council/council_110920_2022085V.mp4",
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2020/council_110920_2022085.vtt", # noqa
"1",
None,
),
(
"https://video.seattle.gov/media/council/council_101220_2022077V.mp4",
None,
None,
None,
),
("https://video.seattle.gov/media/council/council_101220_2022077V.mp4", None),
]


Expand Down Expand Up @@ -121,6 +134,8 @@ def _get_example_event() -> EventIngestionModel:
session_datetime=datetime.utcnow() + (i * timedelta(hours=3)),
session_index=i,
video_uri=session[0],
video_start_time=session[2],
video_end_time=session[3],
caption_uri=session[1],
)
for i, session in enumerate(random.sample(SESSIONS, random.randint(1, 3)))
Expand Down
33 changes: 33 additions & 0 deletions cdp_backend/tests/database/test_validators.py
Expand Up @@ -48,6 +48,39 @@ def test_router_string_is_valid(router_string: str, expected_result: bool) -> No
assert actual_result == expected_result


@pytest.mark.parametrize(
    "time_duration, expected_result",
    [
        # No duration at all is acceptable
        (None, True),
        # Well-formed durations with optional leading fields
        ("1", True),
        ("11", True),
        ("1:11", True),
        ("11:11", True),
        ("1:11:11", True),
        ("99:59:59", True),
        # Zero in every accepted shape
        ("0", True),
        ("00", True),
        ("0:00", True),
        ("00:00", True),
        ("0:00:00", True),
        ("00:00:00", True),
        # Wrong number of digits in a field
        ("111", False),
        ("11:1", False),
        ("111:11", False),
        ("11:1:11", False),
        ("11:11:1", False),
        ("111:11:11", False),
        # Minutes / seconds out of the 00-59 range
        ("60", False),
        ("60:00", False),
        ("1:60:00", False),
        ("1:00:60", False),
    ],
)
def test_time_duration_is_valid(time_duration: str, expected_result: bool) -> None:
    """Check the duration validator against accepted and rejected inputs."""
    assert validators.time_duration_is_valid(time_duration) == expected_result


@pytest.mark.parametrize(
"email, expected_result",
[
Expand Down
5 changes: 4 additions & 1 deletion cdp_backend/tests/pipeline/test_event_gather_pipeline.py
Expand Up @@ -580,6 +580,7 @@ def test_store_event_processing_results(
@mock.patch(f"{PIPELINE_PATH}.fs_functions.upload_file")
@mock.patch(f"{PIPELINE_PATH}.fs_functions.get_open_url_for_gcs_file")
@mock.patch(f"{PIPELINE_PATH}.fs_functions.remove_local_file")
@mock.patch(f"{PIPELINE_PATH}.file_utils.hash_file_contents")
@mock.patch(f"{PIPELINE_PATH}.file_utils.convert_video_to_mp4")
@pytest.mark.parametrize(
"video_filepath, session, expected_filepath, expected_hosted_video_url",
Expand Down Expand Up @@ -618,6 +619,7 @@ def test_store_event_processing_results(
)
def test_convert_video_and_handle_host(
mock_convert_video_to_mp4: MagicMock,
mock_hash_file_contents: MagicMock,
mock_remove_local_file: MagicMock,
mock_generate_url: MagicMock,
mock_upload_file: MagicMock,
Expand All @@ -629,12 +631,13 @@ def test_convert_video_and_handle_host(
mock_upload_file.return_value = "file_store_uri"
mock_generate_url.return_value = "hosted-video.mp4"
mock_convert_video_to_mp4.return_value = expected_filepath
mock_hash_file_contents.return_value = "abc123"

(
mp4_filepath,
session_video_hosted_url,
session_content_hash,
) = pipeline.convert_video_and_handle_host.run(
session_content_hash="abc123",
video_filepath=video_filepath,
session=session,
credentials_file="fake/credentials.json",
Expand Down

0 comments on commit b00f378

Please sign in to comment.