diff --git a/.gitignore b/.gitignore index f8b73e7..f06f1b3 100644 --- a/.gitignore +++ b/.gitignore @@ -137,4 +137,4 @@ dmypy.json # Cython debug symbols cython_debug/ - +.vscode/ diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 8532091..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - // 使用 IntelliSense 了解相关属性。 - // 悬停以查看现有属性的描述。 - // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "init channel", - "type": "debugpy", - "request": "launch", - "program": "init_channel.py", - "console": "integratedTerminal" - }, - { - "name": "move_data", - "type": "debugpy", - "request": "launch", - "program": "move_data.py", - "console": "integratedTerminal" - }, - { - "name": "search video", - "type": "debugpy", - "request": "launch", - "program": "search_video.py", - "console": "integratedTerminal", - "args": ["--start", "2023-09-10T00:00:01Z", - "--end", "2023-09-11T00:00:01Z"] - }, - { - "name": "download video", - "type": "debugpy", - "request": "launch", - "program": "download_video.py", - "console": "integratedTerminal" - }, - { - "name": "test", - "type": "debugpy", - "request": "launch", - "program": "test.py", - "console": "integratedTerminal" - }, - ] -} \ No newline at end of file diff --git a/parse_video.py b/parse_video.py new file mode 100644 index 0000000..4bc09d0 --- /dev/null +++ b/parse_video.py @@ -0,0 +1,121 @@ +import os +import time +from LoggerUtils import Logger, initLogger +from bs4 import BeautifulSoup as bs +from urllib.request import urlopen, Request +import json +import Contant +from sqlalchemy import create_engine +from entity.DownloadInfoEntity import DownloadInfo +from entity.VideoEntity import Video +from entity.ChannelEntity import Channel +from service.DownloadInfoService import DownloadInfoService +from service.VideoService import VideoService +from common.YoutubeUtils import YouTubeUtil +from common.DownloadUtils 
import argparse
import difflib
import operator
from shutil import copyfile

# Layout of parse_video_config.json as read by main():
#   mysql:        host, port, username, password, database
#   log:          dir, fileName
#   parse_root:   directory whose sub-directory names are matched against
#                 channel titles
#   new_srt_path: directory that receives the copied subtitle files
# NOTE(review): the config carries database credentials; keep real values
# out of version control.

# Single-character rewrites applied to a video title when it is turned into
# a subtitle file name.
_TITLE_TRANSLATION = str.maketrans({
    "/": "\u2215",
    # NOTE(review): as written this maps "?" to itself (a no-op); presumably
    # a different replacement character was intended -- confirm.
    "?": "?",
    "\\": "",
    "|": "",
    "<": "",
    ">": "",
    ":": "",
})

# Placeholder logged instead of the real database password.
_MASKED_SECRET = "******"


def get_all_files(directory):
    """
    Recursively collect the path of every file below ``directory``.

    :param directory: directory to walk
    :return: list of file paths, each joined with the directory it was found in
    """
    files = []
    for root, _dirs, filenames in os.walk(directory):
        # Store the full path, not just the bare file name.
        files.extend(os.path.join(root, filename) for filename in filenames)
    return files


def getSrtFileName(video: "Video"):
    """
    Build the subtitle file name expected for ``video``.

    The name is ``<publish date>-<language>-<sanitised title>.srt`` where the
    publish date is the part of ``str(video.videoPublishTime)`` before the
    first "T".

    :param video: object exposing videoTitle, videoPublishTime, videoLanguage
    :return: the file name as a string
    """
    title = video.videoTitle.translate(_TITLE_TRANSLATION)
    publish_date = str(video.videoPublishTime).split("T")[0]
    language = str(video.videoLanguage)
    return f'{publish_date}-{language}-{title}.srt'


def get_equal_rate_1(str1, str2):
    """
    Return a similarity score in [0, 1] for two strings.

    Uses ``SequenceMatcher.quick_ratio()``; the thresholds used by the
    callers in this script are compared against that value.
    """
    return difflib.SequenceMatcher(None, str1, str2).quick_ratio()


def _relative_dir_names(parse_root):
    """Return every directory below ``parse_root`` as a path relative to it."""
    names = []
    for root, _dirs, _filenames in os.walk(parse_root):
        # relpath is separator-agnostic, unlike stripping a "<root>/" prefix.
        relative = os.path.relpath(root, parse_root)
        if relative != os.curdir:
            names.append(relative)
    return names


def _match_channel_dirs(channels, dir_names, parse_root):
    """Map str(channelId) to the directory whose name resembles the channel title."""
    matched = {}
    for channel in channels:
        for name in dir_names:
            if get_equal_rate_1(channel.channelTitle, name) > 0.9:
                # A later matching directory overwrites an earlier one.
                matched[str(channel.channelId)] = os.path.join(parse_root, name)
    return matched


def _walk_files(directory):
    """Return (containing directory, file name) pairs for every file below ``directory``."""
    return [
        (root, filename)
        for root, _dirs, filenames in os.walk(directory)
        for filename in filenames
    ]


def _import_channel_subtitles(channel, videos, channel_dir, new_srt_path):
    """Copy the subtitle files matching ``videos`` and load them into the database."""
    # Imported lazily so the pure helpers above stay importable without the
    # project's download/database packages.
    from common.DownloadUtils import DownloadUtil

    # Walk the channel directory once instead of once per video.
    candidates = _walk_files(channel_dir)
    dst_dir = f"{new_srt_path}/{channel.region}/{channel.channelId}-{channel.channelTitle}"
    for video in videos:
        srt_file_name = getSrtFileName(video=video)
        for root, filename in candidates:
            if get_equal_rate_1(srt_file_name, filename) <= 0.8:
                continue
            src_path = f"{root}/{filename}"
            if not os.path.exists(dst_dir):
                Logger.info("开始创建文件夹:" + dst_dir)
                os.makedirs(dst_dir, exist_ok=True)
            dst_path = f"{dst_dir}/{video.videoId}.srt"
            Logger.info(f"src_path:{src_path} dst_path:{dst_path}")
            copyfile(src_path, dst_path)
            # Also read the copied srt file into the database.
            DownloadUtil.iterateSrt(
                srtFilePath=dst_path, videoId=video.videoId, channelId=channel.channelId)


def main():
    """Read the config, set up logging and the DB engine, then import subtitles."""
    from urllib.parse import quote

    from service.ChannelService import ChannelService

    # Read the configuration file.
    with open('parse_video_config.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Initialise logging.
    Contant.logDir = data['log']['dir']
    Contant.logFileName = data['log']['fileName']
    initLogger(Contant.logDir, Contant.logFileName)

    # Configure the MySQL engine.
    mysql = data['mysql']
    dbHost = mysql['host']
    dbPort = mysql['port']
    dbUserName = mysql['username']
    dbPassword = mysql['password']
    dbDatabase = mysql['database']
    # Never write the real password to the log.
    Logger.info("尝试连接mysql host:'{}' port:'{}' username:'{}' password:'{}' database:'{}'",
                dbHost, dbPort, dbUserName, _MASKED_SECRET, dbDatabase)
    # Percent-encode the credentials so characters such as "@", "/" or ":"
    # cannot corrupt the connection URL.
    url_user = quote(str(dbUserName), safe="")
    url_password = quote(str(dbPassword), safe="")
    Contant.engin = create_engine(
        f'mysql+mysqlconnector://{url_user}:{url_password}@{dbHost}:{dbPort}/{dbDatabase}')
    Logger.info("连接mysql成功")

    parseRoot = data['parse_root']
    newSrtPath = data['new_srt_path']

    Logger.info(f'parseRoot: {parseRoot}')
    dir_names = _relative_dir_names(parseRoot)

    # Pair every known channel with its subtitle directory.
    channel_dirs = _match_channel_dirs(
        ChannelService.queryAllChannel(), dir_names, parseRoot)

    # Copy the subtitle files channel by channel.
    for channel_id, channel_dir in channel_dirs.items():
        channel = ChannelService.queryOneByChannelId(channel_id)
        videos = VideoService.queryAllbyChannelId(channel_id)
        Logger.info(f"key: {channel_id} len: {len(videos)}")
        _import_channel_subtitles(channel, videos, channel_dir, newSrtPath)


if __name__ == "__main__":
    main()
def queryAllbyChannelId(channelId):
    """Return every Video whose channelId equals ``channelId``.

    :param channelId: value compared against ``Video.channelId``
    :return: the list produced by ``Query.all()``
    """
    session = getSession()
    try:
        return session.query(Video).filter(
            Video.channelId == channelId).all()
    finally:
        # Close the session even when the query raises, so it is not leaked.
        session.close()