Browse Source

修复bug

master
appolli 8 months ago
parent
commit
6bf887537e
  1. 7
      common/DownloadUtils.py
  2. 8
      common/YoutubeUtils.py
  3. 4
      download_video.py
  4. 34
      insert_keyword.py
  5. 27
      parse_video.py
  6. 1
      search_video.py
  7. 10
      service/DownloadInfoService.py
  8. 29
      test.py

7
common/DownloadUtils.py

@ -34,7 +34,7 @@ class DownloadUtil:
ordinal = 1 ordinal = 1
for sub in subs: for sub in subs:
srtStartTime = str(sub.start.to_time()).rstrip("0") srtStartTime = str(sub.start.to_time()).rstrip("0")
if ordinal == 1: if sub.start.to_time() == "00:00:00":
srtStartTime = sub.start.to_time() srtStartTime = sub.start.to_time()
srtEndTime = str(sub.end.to_time()).rstrip("0") srtEndTime = str(sub.end.to_time()).rstrip("0")
srtFile: Srtfile = Srtfile(videoId=videoId, channelId=channelId, ordinal=ordinal, srtFile: Srtfile = Srtfile(videoId=videoId, channelId=channelId, ordinal=ordinal,
@ -58,7 +58,7 @@ class DownloadUtil:
fileName = "{}".format(videoId) fileName = "{}".format(videoId)
ys.download(output_path=mp3OutPutPath, filename=fileName, mp3=True) ys.download(output_path=mp3OutPutPath, filename=fileName, mp3=True)
@func_set_timeout(60) @func_set_timeout(120)
def downloadOne(videoId, rootPath): def downloadOne(videoId, rootPath):
video: Video = VideoService.queryOneByVideoId(videoId=videoId) video: Video = VideoService.queryOneByVideoId(videoId=videoId)
channel: Channel = ChannelService.queryOneByChannelId(video.channelId) channel: Channel = ChannelService.queryOneByChannelId(video.channelId)
@ -121,5 +121,6 @@ class DownloadUtil:
if downloadInfo is not None: if downloadInfo is not None:
DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1) DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1)
else: else:
Logger.error("VideoId:{},下载失败".format(videoId))
if downloadInfo is not None: if downloadInfo is not None:
DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 0) DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1)

8
common/YoutubeUtils.py

@ -7,6 +7,7 @@ import time
from entity.ChannelEntity import Channel from entity.ChannelEntity import Channel
from entity.VideoEntity import Video from entity.VideoEntity import Video
from service.ChannelService import ChannelService from service.ChannelService import ChannelService
from service.DownloadInfoService import DownloadInfoService
from service.VideoService import VideoService from service.VideoService import VideoService
@ -128,6 +129,8 @@ class YouTubeUtil:
Logger.info( Logger.info(
"存储VideoUrl:https://www.youtube.com/watch?v=" + videoId "存储VideoUrl:https://www.youtube.com/watch?v=" + videoId
) )
# 新增下载任务
DownloadInfoService.insertOne(videoId=videoId)
else: else:
Logger.info("已存在VideoId:{}".format(videoId)) Logger.info("已存在VideoId:{}".format(videoId))
idList.append(str(videoId)) idList.append(str(videoId))
@ -150,8 +153,9 @@ class YouTubeUtil:
Logger.error(e) Logger.error(e)
# 获取最后一个视频 # 获取最后一个视频
video: Video = VideoService.getLastVideoByChannelId(channelId) video: Video = VideoService.getLastVideoByChannelId(channelId)
ChannelService.updateTimeByChannelId( if video != None:
channelId, video.videoPublishTime) ChannelService.updateTimeByChannelId(
channelId, video.videoPublishTime)
time.sleep(5) time.sleep(5)
# 继续获取下一页 # 继续获取下一页
try: try:

4
download_video.py

@ -43,11 +43,11 @@ if __name__ == "__main__":
exit exit
Logger.info(f"downloadRootPaht: '{downloadRootPath}'") Logger.info(f"downloadRootPaht: '{downloadRootPath}'")
downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish() downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish()
if downloadInfo != None: while downloadInfo != None:
Logger.info("开始下载videoId:{}".format(downloadInfo.videoId)) Logger.info("开始下载videoId:{}".format(downloadInfo.videoId))
# 下载字幕文件或音频文件 # 下载字幕文件或音频文件
DownloadUtil.downloadOne(downloadInfo.videoId, downloadRootPath) DownloadUtil.downloadOne(downloadInfo.videoId, downloadRootPath)
# 重新获取下一个下载任务 # 重新获取下一个下载任务
downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish() downloadInfo = DownloadInfoService.getOneNoFinish()
else: else:
Logger.info("完成下载") Logger.info("完成下载")

34
insert_keyword.py

@ -11,6 +11,7 @@ from service.KeyWordService import KeyWordService
import operator import operator
import argparse import argparse
import pandas as pd import pandas as pd
from common.Utils import getSession
if __name__ == "__main__": if __name__ == "__main__":
@ -39,40 +40,43 @@ if __name__ == "__main__":
Contant.engin = create_engine( Contant.engin = create_engine(
f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}') f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}')
Logger.info("连接mysql成功") Logger.info("连接mysql成功")
session = getSession()
# 读取csv文件 # 读取csv文件
df = pd.read_csv(csvFile, encoding="utf-8") df = pd.read_csv(csvFile, encoding="utf-8")
length = df.shape[0] length = df.shape[0]
keyWords = []
for i in range(0, length): for i in range(0, length):
region = df.iloc[i]['market'] region = df.iloc[i]['market']
lname = str(df.iloc[i]["lname"]).strip() lname = str(df.iloc[i]["lname"]).strip()
sname = str(df.iloc[i]["sname"]).strip() sname = str(df.iloc[i]["sname"]).strip()
# 判断是否存在如果不存在就存入 # 判断是否存在如果不存在就存入
region = "Taiwan" region = "Taiwan"
keyWord: Keyword = KeyWordService.queryOneByRegionWord( keyWord: Keyword = session.query(Keyword).filter(
region=region, word=lname) Keyword.region == region, Keyword.word == lname).one_or_none()
if keyWord == None: if keyWord == None:
keyWords.append(Keyword(region=region, word=lname)) session.add(Keyword(region=region, word=lname))
session.commit()
Logger.info(f"region:{region},keyword:{lname}") Logger.info(f"region:{region},keyword:{lname}")
keyWord: Keyword = KeyWordService.queryOneByRegionWord( keyWord: Keyword = session.query(Keyword).filter(
region=region, word=sname) Keyword.region == region, Keyword.word == sname).one_or_none()
if keyWord == None: if keyWord == None:
keyWords.append(Keyword(region=region, word=sname)) session.add(Keyword(region=region, word=sname))
session.commit()
Logger.info(f"region:{region},keyword:{sname}") Logger.info(f"region:{region},keyword:{sname}")
region = "Hongkong" region = "Hongkong"
keyWord: Keyword = KeyWordService.queryOneByRegionWord( keyWord: Keyword = session.query(Keyword).filter(
region=region, word=lname) Keyword.region == region, Keyword.word == lname).one_or_none()
if keyWord == None: if keyWord == None:
keyWords.append(Keyword(region=region, word=lname)) session.add(Keyword(region=region, word=lname))
session.commit()
Logger.info(f"region:{region},keyword:{lname}") Logger.info(f"region:{region},keyword:{lname}")
keyWord: Keyword = KeyWordService.queryOneByRegionWord( keyWord: Keyword = session.query(Keyword).filter(
region=region, word=sname) Keyword.region == region, Keyword.word == sname).one_or_none()
if keyWord == None: if keyWord == None:
keyWords.append(Keyword(region=region, word=sname)) session.add(Keyword(region=region, word=sname))
session.commit()
Logger.info(f"region:{region},keyword:{sname}") Logger.info(f"region:{region},keyword:{sname}")
KeyWordService.insterKeyWords(keyWords=keyWords)

27
parse_video.py

@ -18,6 +18,7 @@ import operator
import argparse import argparse
import difflib import difflib
from shutil import copyfile from shutil import copyfile
import shutil
def get_all_files(directory): def get_all_files(directory):
@ -102,20 +103,36 @@ if __name__ == "__main__":
channel: Channel = ChannelService.queryOneByChannelId(key) channel: Channel = ChannelService.queryOneByChannelId(key)
videos = VideoService.queryAllbyChannelId(key) videos = VideoService.queryAllbyChannelId(key)
Logger.info(f"key: {key} len: {len(videos)}") Logger.info(f"key: {key} len: {len(videos)}")
channelRoot = ""
moveCount = 0
for i in range(len(videos)): for i in range(len(videos)):
video: Video = videos[i] video: Video = videos[i]
srtFileName = getSrtFileName(video=video) srtFileName = getSrtFileName(video=video)
Logger.info(f"匹配video: {video.videoId} i:{i}")
for root, dirs, filenames in os.walk(value): for root, dirs, filenames in os.walk(value):
channelRoot = root
breakFlag = False
for filename in filenames: for filename in filenames:
if get_equal_rate_1(srtFileName, filename) > 0.8: diff = get_equal_rate_1(srtFileName, filename)
if diff > 0.7:
src_path = f"{root}/{filename}" src_path = f"{root}/{filename}"
dst_path = f"{newSrtPaht}/{channel.region}/{channel.channelId}-{channel.channelTitle}" dst_path = f"{newSrtPaht}/{channel.region}/{channel.channelId}-{channel.channelTitle}"
if not os.path.exists(dst_path): if not os.path.exists(dst_path):
Logger.info("开始创建文件夹:" + dst_path) Logger.info("开始创建文件夹:" + dst_path)
os.makedirs(dst_path) os.makedirs(dst_path)
dst_path = f"{dst_path}/{video.videoId}.srt" dst_path = f"{dst_path}/{video.videoId}.srt"
Logger.info(f"src_path:{src_path} dst_path:{dst_path}") Logger.info(
copyfile(src_path, dst_path) f"src_path:{src_path} dst_path:{dst_path} diff:{diff} i:{i}")
shutil.move(src_path, dst_path)
moveCount = moveCount + 1
breakFlag = True
break
# 并且读取srt文件到数据库 # 并且读取srt文件到数据库
DownloadUtil.iterateSrt( # DownloadUtil.iterateSrt(
srtFilePath=dst_path, videoId=video.videoId, channelId=channel.channelId) # srtFilePath=dst_path, videoId=video.videoId, channelId=channel.channelId)
if breakFlag:
break
# 删除channel文件
Logger.info(f"删除channelRoot:{channelRoot} 移动总计:{moveCount}")
shutil.rmtree(channelRoot)
time.sleep(5)

1
search_video.py

@ -51,3 +51,4 @@ if __name__ == "__main__":
f"Id:{channel.id} channelId:{channel.channelId} startTime:{startTime} endTime:{endTime}") f"Id:{channel.id} channelId:{channel.channelId} startTime:{startTime} endTime:{endTime}")
YouTubeUtil.getByChannelId( YouTubeUtil.getByChannelId(
channelId=channel.channelId, startTime=startTime, endTime=endTime) channelId=channel.channelId, startTime=startTime, endTime=endTime)
# YouTubeUtil.getByChannelId(channelId="UC67Wr_9pA4I0glIxDt_Cpyw",startTime=startTime, endTime=endTime)

10
service/DownloadInfoService.py

@ -22,7 +22,15 @@ class DownloadInfoService:
def updateIsFinishByVideoId(videoId, tryTime, isFinish): def updateIsFinishByVideoId(videoId, tryTime, isFinish):
session = getSession() session = getSession()
updateSql = update(DownloadInfo).where( updateSql = update(DownloadInfo).where(
DownloadInfo.videoId == videoId).values(tryTime=tryTime, isFinish=isFinish) DownloadInfo.videoId == videoId).values(tryTime=tryTime, isFinished=isFinish)
resutl = session.execute(updateSql) resutl = session.execute(updateSql)
session.commit() session.commit()
session.close() session.close()
def insertOne(videoId):
    """Insert a new download-task row for the given video.

    Builds a DownloadInfo with downloadType=1, tryTime=0 and isFinished=0,
    adds it to a session obtained from getSession() and commits it.

    Args:
        videoId: Identifier of the video the download task belongs to.

    Raises:
        Whatever session.add / session.commit raise; the error is not
        swallowed, but the session is still closed before it propagates.
    """
    session = getSession()
    try:
        downloadInfo: DownloadInfo = DownloadInfo(videoId=videoId, downloadType=1,
                                                  tryTime=0, isFinished=0)
        session.add(downloadInfo)
        session.commit()
    finally:
        # Close unconditionally so a failed add/commit does not leak the
        # session (presumably a SQLAlchemy Session -- confirm in common.Utils).
        session.close()

29
test.py

@ -1,3 +1,4 @@
import time
from LoggerUtils import Logger, initLogger from LoggerUtils import Logger, initLogger
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
from urllib.request import urlopen, Request from urllib.request import urlopen, Request
@ -8,10 +9,14 @@ from entity.ChannelEntity import Channel
from entity.VideoEntity import Video from entity.VideoEntity import Video
from service.ChannelService import ChannelService from service.ChannelService import ChannelService
from service.VideoService import VideoService from service.VideoService import VideoService
from service.ChannelService import ChannelService
from service.DownloadInfoService import DownloadInfoService
from common.YoutubeUtils import YouTubeUtil from common.YoutubeUtils import YouTubeUtil
from common.DownloadUtils import DownloadUtil from common.DownloadUtils import DownloadUtil
import operator import operator
import argparse import argparse
import os
from common.Utils import getSession
if __name__ == "__main__": if __name__ == "__main__":
# 读取配置文件 # 读取配置文件
@ -35,7 +40,7 @@ if __name__ == "__main__":
Contant.engin = create_engine( Contant.engin = create_engine(
f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}') f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}')
Logger.info("连接mysql成功") Logger.info("连接mysql成功")
# session = getSession()
# YouTubeUtil测试 # YouTubeUtil测试
# channelId = "UCBM86JVoHLqg9irpR2XKvGw" # channelId = "UCBM86JVoHLqg9irpR2XKvGw"
# startTime = "2024-08-22T00:00:01Z" # startTime = "2024-08-22T00:00:01Z"
@ -53,3 +58,25 @@ if __name__ == "__main__":
# videoId=videoId, channelId=channelId) # videoId=videoId, channelId=channelId)
# DownloadUtil.downLoadMP3(videoId=videoId, storePath=storePath) # DownloadUtil.downLoadMP3(videoId=videoId, storePath=storePath)
DownloadInfoService.updateIsFinishByVideoId("CaLR6W_cyeI",1,1)
# 遍历字幕文件,并存入数据库
# srtRootPath = "/mnt/mysql_srt_path_tmp"
# for root, dirs, filenames in os.walk(srtRootPath):
# Logger.info(f"root: {root} filesLen:{len(filenames)}")
# for filename in filenames:
# srtPath = f"{root}/{filename}"
# videoId = filename.replace(".srt", "")
# video: Video = session.query(Video).filter(
# Video.videoId == videoId).one_or_none()
# if video == None:
# continue
# channelId = video.channelId
# Logger.info(
# f"videoId: {videoId},channelId :{channelId}, srtPath: {srtPath}")
# DownloadUtil.iterateSrt(srtFilePath=srtPath,
# videoId=videoId, channelId=channelId)
# os.remove(srtPath)
# 关闭session
# session.close()

Loading…
Cancel
Save