Browse Source

修复bug

master
appolli 8 months ago
parent
commit
6bf887537e
  1. 7
      common/DownloadUtils.py
  2. 4
      common/YoutubeUtils.py
  3. 4
      download_video.py
  4. 34
      insert_keyword.py
  5. 27
      parse_video.py
  6. 1
      search_video.py
  7. 10
      service/DownloadInfoService.py
  8. 29
      test.py

7
common/DownloadUtils.py

@ -34,7 +34,7 @@ class DownloadUtil:
ordinal = 1
for sub in subs:
srtStartTime = str(sub.start.to_time()).rstrip("0")
if ordinal == 1:
if sub.start.to_time() == "00:00:00":
srtStartTime = sub.start.to_time()
srtEndTime = str(sub.end.to_time()).rstrip("0")
srtFile: Srtfile = Srtfile(videoId=videoId, channelId=channelId, ordinal=ordinal,
@ -58,7 +58,7 @@ class DownloadUtil:
fileName = "{}".format(videoId)
ys.download(output_path=mp3OutPutPath, filename=fileName, mp3=True)
@func_set_timeout(60)
@func_set_timeout(120)
def downloadOne(videoId, rootPath):
video: Video = VideoService.queryOneByVideoId(videoId=videoId)
channel: Channel = ChannelService.queryOneByChannelId(video.channelId)
@ -121,5 +121,6 @@ class DownloadUtil:
if downloadInfo is not None:
DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1)
else:
Logger.error("VideoId:{},下载失败".format(videoId))
if downloadInfo is not None:
DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 0)
DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1)

4
common/YoutubeUtils.py

@ -7,6 +7,7 @@ import time
from entity.ChannelEntity import Channel
from entity.VideoEntity import Video
from service.ChannelService import ChannelService
from service.DownloadInfoService import DownloadInfoService
from service.VideoService import VideoService
@ -128,6 +129,8 @@ class YouTubeUtil:
Logger.info(
"存储VideoUrl:https://www.youtube.com/watch?v=" + videoId
)
# 新增下载任务
DownloadInfoService.insertOne(videoId=videoId)
else:
Logger.info("已存在VideoId:{}".format(videoId))
idList.append(str(videoId))
@ -150,6 +153,7 @@ class YouTubeUtil:
Logger.error(e)
# 获取最后一个视频
video: Video = VideoService.getLastVideoByChannelId(channelId)
if video != None:
ChannelService.updateTimeByChannelId(
channelId, video.videoPublishTime)
time.sleep(5)

4
download_video.py

@ -43,11 +43,11 @@ if __name__ == "__main__":
exit
Logger.info(f"downloadRootPaht: '{downloadRootPath}'")
downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish()
if downloadInfo != None:
while downloadInfo != None:
Logger.info("开始下载videoId:{}".format(downloadInfo.videoId))
# 下载字幕文件或音频文件
DownloadUtil.downloadOne(downloadInfo.videoId, downloadRootPath)
# 重新获取下一个下载任务
downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish()
downloadInfo = DownloadInfoService.getOneNoFinish()
else:
Logger.info("完成下载")

34
insert_keyword.py

@ -11,6 +11,7 @@ from service.KeyWordService import KeyWordService
import operator
import argparse
import pandas as pd
from common.Utils import getSession
if __name__ == "__main__":
@ -39,40 +40,43 @@ if __name__ == "__main__":
Contant.engin = create_engine(
f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}')
Logger.info("连接mysql成功")
session = getSession()
# 读取csv文件
df = pd.read_csv(csvFile, encoding="utf-8")
length = df.shape[0]
keyWords = []
for i in range(0, length):
region = df.iloc[i]['market']
lname = str(df.iloc[i]["lname"]).strip()
sname = str(df.iloc[i]["sname"]).strip()
# 判断是否存在如果不存在就存入
region = "Taiwan"
keyWord: Keyword = KeyWordService.queryOneByRegionWord(
region=region, word=lname)
keyWord: Keyword = session.query(Keyword).filter(
Keyword.region == region, Keyword.word == lname).one_or_none()
if keyWord == None:
keyWords.append(Keyword(region=region, word=lname))
session.add(Keyword(region=region, word=lname))
session.commit()
Logger.info(f"region:{region},keyword:{lname}")
keyWord: Keyword = KeyWordService.queryOneByRegionWord(
region=region, word=sname)
keyWord: Keyword = session.query(Keyword).filter(
Keyword.region == region, Keyword.word == sname).one_or_none()
if keyWord == None:
keyWords.append(Keyword(region=region, word=sname))
session.add(Keyword(region=region, word=sname))
session.commit()
Logger.info(f"region:{region},keyword:{sname}")
region = "Hongkong"
keyWord: Keyword = KeyWordService.queryOneByRegionWord(
region=region, word=lname)
keyWord: Keyword = session.query(Keyword).filter(
Keyword.region == region, Keyword.word == lname).one_or_none()
if keyWord == None:
keyWords.append(Keyword(region=region, word=lname))
session.add(Keyword(region=region, word=lname))
session.commit()
Logger.info(f"region:{region},keyword:{lname}")
keyWord: Keyword = KeyWordService.queryOneByRegionWord(
region=region, word=sname)
keyWord: Keyword = session.query(Keyword).filter(
Keyword.region == region, Keyword.word == sname).one_or_none()
if keyWord == None:
keyWords.append(Keyword(region=region, word=sname))
session.add(Keyword(region=region, word=sname))
session.commit()
Logger.info(f"region:{region},keyword:{sname}")
KeyWordService.insterKeyWords(keyWords=keyWords)

27
parse_video.py

@ -18,6 +18,7 @@ import operator
import argparse
import difflib
from shutil import copyfile
import shutil
def get_all_files(directory):
@ -102,20 +103,36 @@ if __name__ == "__main__":
channel: Channel = ChannelService.queryOneByChannelId(key)
videos = VideoService.queryAllbyChannelId(key)
Logger.info(f"key: {key} len: {len(videos)}")
channelRoot = ""
moveCount = 0
for i in range(len(videos)):
video: Video = videos[i]
srtFileName = getSrtFileName(video=video)
Logger.info(f"匹配video: {video.videoId} i:{i}")
for root, dirs, filenames in os.walk(value):
channelRoot = root
breakFlag = False
for filename in filenames:
if get_equal_rate_1(srtFileName, filename) > 0.8:
diff = get_equal_rate_1(srtFileName, filename)
if diff > 0.7:
src_path = f"{root}/{filename}"
dst_path = f"{newSrtPaht}/{channel.region}/{channel.channelId}-{channel.channelTitle}"
if not os.path.exists(dst_path):
Logger.info("开始创建文件夹:" + dst_path)
os.makedirs(dst_path)
dst_path = f"{dst_path}/{video.videoId}.srt"
Logger.info(f"src_path:{src_path} dst_path:{dst_path}")
copyfile(src_path, dst_path)
Logger.info(
f"src_path:{src_path} dst_path:{dst_path} diff:{diff} i:{i}")
shutil.move(src_path, dst_path)
moveCount = moveCount + 1
breakFlag = True
break
# 并且读取srt文件到数据库
DownloadUtil.iterateSrt(
srtFilePath=dst_path, videoId=video.videoId, channelId=channel.channelId)
# DownloadUtil.iterateSrt(
# srtFilePath=dst_path, videoId=video.videoId, channelId=channel.channelId)
if breakFlag:
break
# 删除channel文件
Logger.info(f"删除channelRoot:{channelRoot} 移动总计:{moveCount}")
shutil.rmtree(channelRoot)
time.sleep(5)

1
search_video.py

@ -51,3 +51,4 @@ if __name__ == "__main__":
f"Id:{channel.id} channelId:{channel.channelId} startTime:{startTime} endTime:{endTime}")
YouTubeUtil.getByChannelId(
channelId=channel.channelId, startTime=startTime, endTime=endTime)
# YouTubeUtil.getByChannelId(channelId="UC67Wr_9pA4I0glIxDt_Cpyw",startTime=startTime, endTime=endTime)

10
service/DownloadInfoService.py

@ -22,7 +22,15 @@ class DownloadInfoService:
def updateIsFinishByVideoId(videoId, tryTime, isFinish):
session = getSession()
updateSql = update(DownloadInfo).where(
DownloadInfo.videoId == videoId).values(tryTime=tryTime, isFinish=isFinish)
DownloadInfo.videoId == videoId).values(tryTime=tryTime, isFinished=isFinish)
resutl = session.execute(updateSql)
session.commit()
session.close()
def insertOne(videoId):
    """Insert a new download-task row for ``videoId``.

    The row is created with ``downloadType=1``, ``tryTime=0`` and
    ``isFinished=0``.

    Args:
        videoId: identifier of the video the task refers to.

    Raises:
        Whatever ``session.add`` / ``session.commit`` raise; the session is
        closed before the exception propagates.
    """
    session = getSession()
    try:
        downloadInfo: DownloadInfo = DownloadInfo(videoId=videoId, downloadType=1,
                                                  tryTime=0, isFinished=0)
        session.add(downloadInfo)
        session.commit()
    finally:
        # Always release the connection, even when add/commit fails;
        # previously close() was skipped on error and the session leaked.
        session.close()

29
test.py

@ -1,3 +1,4 @@
import time
from LoggerUtils import Logger, initLogger
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen, Request
@ -8,10 +9,14 @@ from entity.ChannelEntity import Channel
from entity.VideoEntity import Video
from service.ChannelService import ChannelService
from service.VideoService import VideoService
from service.ChannelService import ChannelService
from service.DownloadInfoService import DownloadInfoService
from common.YoutubeUtils import YouTubeUtil
from common.DownloadUtils import DownloadUtil
import operator
import argparse
import os
from common.Utils import getSession
if __name__ == "__main__":
# 读取配置文件
@ -35,7 +40,7 @@ if __name__ == "__main__":
Contant.engin = create_engine(
f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}')
Logger.info("连接mysql成功")
# session = getSession()
# YouTubeUtil测试
# channelId = "UCBM86JVoHLqg9irpR2XKvGw"
# startTime = "2024-08-22T00:00:01Z"
@ -53,3 +58,25 @@ if __name__ == "__main__":
# videoId=videoId, channelId=channelId)
# DownloadUtil.downLoadMP3(videoId=videoId, storePath=storePath)
DownloadInfoService.updateIsFinishByVideoId("CaLR6W_cyeI",1,1)
# 遍历字幕文件,并存入数据库
# srtRootPath = "/mnt/mysql_srt_path_tmp"
# for root, dirs, filenames in os.walk(srtRootPath):
# Logger.info(f"root: {root} filesLen:{len(filenames)}")
# for filename in filenames:
# srtPath = f"{root}/{filename}"
# videoId = filename.replace(".srt", "")
# video: Video = session.query(Video).filter(
# Video.videoId == videoId).one_or_none()
# if video == None:
# continue
# channelId = video.channelId
# Logger.info(
# f"videoId: {videoId},channelId :{channelId}, srtPath: {srtPath}")
# DownloadUtil.iterateSrt(srtFilePath=srtPath,
# videoId=videoId, channelId=channelId)
# os.remove(srtPath)
# 关闭session
# session.close()

Loading…
Cancel
Save