Spaces:
Runtime error
Runtime error
| from typing import List, Generator, Tuple | |
| from pathlib import Path | |
| from itertools import islice | |
| import scrapetube | |
| from youtubesearchpython import ChannelsSearch | |
| from pytube import Playlist | |
| from utils import accepts_types | |
| from loading.serialization import Serializer | |
class YoutubeVideoPreprocessor:
    """Create one small JSON descriptor file per video of a channel or playlist.

    Each JSON file holds the following information:
        - channel_name: The channel name passed in (or, in "playlist" mode,
          the playlist title).
        - url: The watch URL of the video.

    Args:
        mode (`str`):
            "channel_name" to look a channel up by name, "playlist" to read
            a playlist URL. Any other value makes `preprocess` return
            placeholder test paths (not implemented yet).
        serializer:
            Object whose `dump(obj=..., save_path=...)` writes one dict to
            disk. Defaults to `Serializer`.
            NOTE(review): the default is the `Serializer` class itself, not
            an instance -- confirm `dump` can be called that way.

    TODO: Change it to accept also URL of video list, name of video list, etc.
    """

    def __init__(self,
                 mode: str = "channel_name",
                 serializer = Serializer) -> None:
        self.mode = mode
        self.serializer = serializer

    def preprocess(self,
                   name: str,
                   num_videos: int,
                   videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Write up to `num_videos` JSON files for videos not yet in the dataset.

        Args:
            name (`str`):
                Channel name in "channel_name" mode, playlist URL in
                "playlist" mode.
            num_videos (`int`):
                Maximum number of JSON files to create.
            videos_in_ds (`List[str]`):
                Video ids to skip.

        Returns:
            load_paths (`List[Path]`):
                The paths of the JSON files that were written.
            dataset_folder (`Path`):
                The folder containing those files.

        Raises:
            IndexError: In "channel_name" mode, when the channel search
                returns no result.
        """
        if self.mode == "channel_name":
            # TODO: Add credits
            matches = ChannelsSearch(name, limit=1).result()["result"]
            if not matches:
                # Same exception type as indexing the empty result list
                # would raise, but with a message that names the channel.
                raise IndexError(f"No YouTube channel found for name {name!r}")
            videos = scrapetube.get_channel(channel_id=matches[0]["id"])
        elif self.mode == "playlist":
            playlist_id = self._extract_playlist_id(name)
            playlist = Playlist(name)
            # From here on `name` is the playlist title; it is used both as
            # the folder name and as the "channel_name" value in the JSON.
            name = playlist.title
            videos = scrapetube.get_playlist(playlist_id)
        else:
            # TODO: implement this part
            test_files_folder = self._youtube_folder() / "test/files"
            return [Path("test.json"), Path("test1.json")], test_files_folder

        return self._convert_videos_to_json_files(name,
                                                  videos,
                                                  num_videos,
                                                  videos_in_ds)

    @staticmethod
    def _youtube_folder() -> Path:
        """Return the root folder under which dataset folders are created."""
        return Path.home() / "whisper_gpt_pipeline/youtube_transcriber"

    @staticmethod
    def _extract_playlist_id(playlist_ref: str) -> str:
        """Return the playlist id contained in a playlist URL.

        The `list` query parameter is preferred so that URLs carrying extra
        parameters after it (e.g. `...&list=PL123&index=3`) still resolve to
        the playlist id. When no such parameter can be parsed, fall back to
        the text after the last "=" (the whole string if there is none).
        """
        from urllib.parse import parse_qs, urlparse

        list_values = parse_qs(urlparse(playlist_ref).query).get("list")
        if list_values:
            return list_values[0]
        return playlist_ref.split("=")[-1]

    def _convert_videos_to_json_files(self,
                                      name: str,
                                      videos: Generator,
                                      num_videos: int,
                                      videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Dump one `{channel_name, url}` JSON file per new video.

        Videos whose "videoId" is in `videos_in_ds` are skipped and do not
        count towards `num_videos`. Files are named "0.json", "1.json", ...
        in the order the videos are yielded. Iteration stops as soon as
        `num_videos` files were written or `videos` is exhausted.

        Args:
            name (`str`):
                Folder name of the dataset and value of "channel_name".
            videos:
                Iterable of mappings that expose a "videoId" key.
            num_videos (`int`):
                Maximum number of files to write; values <= 0 write nothing.
            videos_in_ds (`List[str]`):
                Video ids to skip; `None` is treated as empty.

        Returns:
            load_paths (`List[Path]`):
                The paths of the JSON files that were written.
            dataset_folder (`Path`):
                The folder containing those files.
        """
        dataset_folder = self._youtube_folder() / name
        dataset_folder.mkdir(parents=True, exist_ok=True)

        # Build the lookup set once: O(1) membership per video instead of
        # scanning the list for every candidate.
        known_ids = set(videos_in_ds or ())
        new_videos = (video for video in videos
                      if video["videoId"] not in known_ids)

        load_paths = []
        # islice stops pulling from `videos` once enough new ones were
        # found, so a lazy scraper is not advanced further than needed.
        for index, video in enumerate(islice(new_videos, max(num_videos, 0))):
            save_path = dataset_folder / f"{index}.json"
            save_path.touch(exist_ok=True)
            video_dict = {"channel_name": name,
                          "url": f"https://www.youtube.com/watch?v={video['videoId']}"}
            self.serializer.dump(obj=video_dict, save_path=save_path)
            load_paths.append(save_path)

        return load_paths, dataset_folder