zhangshu
5 months ago
5 changed files with 144 additions and 46 deletions
@ -1,45 +0,0 @@ |
|||||
{ |
|
||||
// 使用 IntelliSense 了解相关属性。 |
|
||||
// 悬停以查看现有属性的描述。 |
|
||||
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 |
|
||||
"version": "0.2.0", |
|
||||
"configurations": [ |
|
||||
{ |
|
||||
"name": "init channel", |
|
||||
"type": "debugpy", |
|
||||
"request": "launch", |
|
||||
"program": "init_channel.py", |
|
||||
"console": "integratedTerminal" |
|
||||
}, |
|
||||
{ |
|
||||
"name": "move_data", |
|
||||
"type": "debugpy", |
|
||||
"request": "launch", |
|
||||
"program": "move_data.py", |
|
||||
"console": "integratedTerminal" |
|
||||
}, |
|
||||
{ |
|
||||
"name": "search video", |
|
||||
"type": "debugpy", |
|
||||
"request": "launch", |
|
||||
"program": "search_video.py", |
|
||||
"console": "integratedTerminal", |
|
||||
"args": ["--start", "2023-09-10T00:00:01Z", |
|
||||
"--end", "2023-09-11T00:00:01Z"] |
|
||||
}, |
|
||||
{ |
|
||||
"name": "download video", |
|
||||
"type": "debugpy", |
|
||||
"request": "launch", |
|
||||
"program": "download_video.py", |
|
||||
"console": "integratedTerminal" |
|
||||
}, |
|
||||
{ |
|
||||
"name": "test", |
|
||||
"type": "debugpy", |
|
||||
"request": "launch", |
|
||||
"program": "test.py", |
|
||||
"console": "integratedTerminal" |
|
||||
}, |
|
||||
] |
|
||||
} |
|
@ -0,0 +1,121 @@ |
|||||
|
import os |
||||
|
import time |
||||
|
from LoggerUtils import Logger, initLogger |
||||
|
from bs4 import BeautifulSoup as bs |
||||
|
from urllib.request import urlopen, Request |
||||
|
import json |
||||
|
import Contant |
||||
|
from sqlalchemy import create_engine |
||||
|
from entity.DownloadInfoEntity import DownloadInfo |
||||
|
from entity.VideoEntity import Video |
||||
|
from entity.ChannelEntity import Channel |
||||
|
from service.DownloadInfoService import DownloadInfoService |
||||
|
from service.VideoService import VideoService |
||||
|
from common.YoutubeUtils import YouTubeUtil |
||||
|
from common.DownloadUtils import DownloadUtil |
||||
|
from service.ChannelService import ChannelService |
||||
|
import operator |
||||
|
import argparse |
||||
|
import difflib |
||||
|
from shutil import copyfile |
||||
|
|
||||
|
|
||||
|
def get_all_files(directory):
    """
    Recursively collect the paths of all files under a directory.

    :param directory: root directory to walk
    :return: list of file paths; each entry is the file name joined onto
        the directory in which ``os.walk`` found it, so files with the same
        name in different sub-directories stay distinguishable. An empty
        list is returned when nothing is found.
    """
    files = []
    for root, _dirs, filenames in os.walk(directory):
        # Store the full path, not just the bare file name, as the
        # contract above promises.
        files.extend(os.path.join(root, filename) for filename in filenames)
    return files
||||
|
|
||||
|
|
||||
|
def getSrtFileName(video: Video):
    """
    Build the expected ``.srt`` file name for a video.

    The name has the form ``<publish date>-<language>-<title>.srt`` where the
    publish date is the part of ``video.videoPublishTime`` before the first
    ``"T"`` and the title is ``video.videoTitle`` with a few characters
    replaced or removed.

    :param video: object exposing ``videoTitle``, ``videoPublishTime`` and
        ``videoLanguage``
    :return: the file name as a string
    """
    title = video.videoTitle.replace("/", "\u2215")
    # NOTE(review): as written this replacement maps "?" to itself and so
    # changes nothing; possibly a different replacement character was
    # intended - confirm against the code that names the downloaded files.
    title = title.replace("?", "?")
    # These characters are dropped from the title entirely.
    for dropped in ("\\", "|", "<", ">", ":"):
        title = title.replace(dropped, "")

    publish_date = str(video.videoPublishTime).split("T")[0]
    language = str(video.videoLanguage)
    return f'{publish_date}-{language}-{title}.srt'
||||
|
|
||||
|
|
||||
|
def get_equal_rate_1(str1, str2):
    """
    Return a similarity score for two strings.

    The score is ``difflib.SequenceMatcher.quick_ratio()`` for the pair,
    computed with no junk filter.

    :param str1: first string
    :param str2: second string
    :return: the value returned by ``quick_ratio()``
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.quick_ratio()
||||
|
|
||||
|
|
||||
|
if __name__ == "__main__":
    # Imported here because the file's import header only brings in
    # urllib.request, not urllib.parse.
    from urllib.parse import quote_plus

    # Read the run configuration.
    with open('parse_video_config.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Initialise logging.
    Contant.logDir = data['log']['dir']
    Contant.logFileName = data['log']['fileName']
    initLogger(Contant.logDir, Contant.logFileName)

    # Connect to MySQL.
    dbHost = data['mysql']['host']
    dbPort = data['mysql']['port']
    dbUserName = data['mysql']['username']
    dbPassword = data['mysql']['password']
    dbDatabase = data['mysql']['database']
    # The password is deliberately left out of the log line.
    Logger.info("尝试连接mysql host:'{}' port:'{}' username:'{}' database:'{}'",
                dbHost, dbPort, dbUserName, dbDatabase)
    # URL-escape the credentials so characters such as "@", ":" or "/"
    # inside them cannot corrupt the connection URL.
    Contant.engin = create_engine(
        f'mysql+mysqlconnector://{quote_plus(str(dbUserName))}:{quote_plus(str(dbPassword))}'
        f'@{dbHost}:{dbPort}/{dbDatabase}')
    Logger.info("连接mysql成功")

    parseRoot = data['parse_root']
    newSrtPaht = data['new_srt_path']

    Logger.info(f'parseRoot: {parseRoot}')

    # Collect every directory below parseRoot as a candidate channel folder,
    # expressed relative to parseRoot. os.path.relpath works regardless of
    # which path separator os.walk reports; the root itself is skipped.
    chanelNameList = []
    for root, _dirs, _filenames in os.walk(parseRoot):
        channelName = os.path.relpath(root, parseRoot)
        if channelName == os.curdir:
            continue
        chanelNameList.append(channelName)

    # Map channel id -> folder for every channel whose title closely matches
    # a folder name. When several folders match, the last one wins.
    channel_dict = {}
    for channel in ChannelService.queryAllChannel():
        channelTitle = channel.channelTitle
        for channelName in chanelNameList:
            if get_equal_rate_1(channelTitle, channelName) > 0.9:
                channel_dict[str(channel.channelId)] = os.path.join(
                    parseRoot, channelName)

    # Walk channel_dict and copy the subtitle file of each video.
    for key, value in channel_dict.items():
        channel = ChannelService.queryOneByChannelId(key)
        videos = VideoService.queryAllbyChannelId(key)
        Logger.info(f"key: {key} len: {len(videos)}")

        # The folder content does not depend on the video, so list it once
        # per channel instead of once per video.
        candidates = []
        for root, _dirs, filenames in os.walk(value):
            for filename in filenames:
                candidates.append((filename, os.path.join(root, filename)))

        for video in videos:
            srtFileName = getSrtFileName(video=video)

            # Keep only the best-scoring file above the 0.8 threshold, so
            # that several similar names are not all copied over the same
            # destination and imported repeatedly.
            best_rate = 0.8
            src_path = None
            for filename, candidate_path in candidates:
                rate = get_equal_rate_1(srtFileName, filename)
                if rate > best_rate:
                    best_rate = rate
                    src_path = candidate_path
            if src_path is None:
                continue

            dst_path = f"{newSrtPaht}/{channel.region}/{channel.channelId}-{channel.channelTitle}"
            if not os.path.exists(dst_path):
                Logger.info("开始创建文件夹:" + dst_path)
                os.makedirs(dst_path)
            dst_path = f"{dst_path}/{video.videoId}.srt"
            Logger.info(f"src_path:{src_path} dst_path:{dst_path}")
            copyfile(src_path, dst_path)
            # Also load the copied srt file into the database.
            DownloadUtil.iterateSrt(
                srtFilePath=dst_path, videoId=video.videoId, channelId=channel.channelId)
@ -0,0 +1,15 @@ |
|||||
|
{ |
||||
|
"mysql": { |
||||
|
"host": "47.108.20.249", |
||||
|
"port": "3306", |
||||
|
"username": "root", |
||||
|
"password": "casino888!", |
||||
|
"database": "youtube" |
||||
|
}, |
||||
|
"log": { |
||||
|
"dir": "./logs", |
||||
|
"fileName": "parse_video" |
||||
|
}, |
||||
|
"parse_root": "E:/code/python/tmp_srt_file", |
||||
|
"new_srt_path": "/mnt/new_srt_path" |
||||
|
} |
Loading…
Reference in new issue