Browse Source

新增download_video

master
zhangshu 8 months ago
parent
commit
f78d832473
  1. 7
      .vscode/launch.json
  2. 122
      common/DownloadUtils.py
  3. 46
      download_video.py
  4. 13
      download_video_config.json
  5. 15
      entity/DownloadInfoEntity.py
  6. 17
      entity/SrtFileEntity.py
  7. 1
      search_video.py
  8. 28
      service/DownloadInfoService.py
  9. 23
      service/SrtFileService.py
  10. 11
      service/VideoService.py

7
.vscode/launch.json

@ -27,6 +27,13 @@
"args": ["--start", "2023-09-10T00:00:01Z",
"--end", "2023-09-11T00:00:01Z"]
},
{
"name": "download video",
"type": "debugpy",
"request": "launch",
"program": "download_video.py",
"console": "integratedTerminal"
},
{
"name": "test",
"type": "debugpy",

122
common/DownloadUtils.py

@ -0,0 +1,122 @@
from shutil import copyfile
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import SRTFormatter
from LoggerUtils import Logger
import time
import os
from func_timeout import func_set_timeout
import operator
import pysrt
from pytubefix import YouTube
from pytubefix.cli import on_progress
from entity.VideoEntity import Video
from entity.ChannelEntity import Channel
from entity.DownloadInfoEntity import DownloadInfo
from entity.SrtFileEntity import Srtfile
from service.VideoService import VideoService
from service.ChannelService import ChannelService
from service.DownloadInfoService import DownloadInfoService
from service.SrtFileService import SrtFileService
class DownloadUtil:
    """Download YouTube subtitles (or audio as a fallback) and record the result.

    All methods are called on the class itself (``DownloadUtil.downloadOne(...)``)
    and are therefore declared as static methods.
    """

    formatter = SRTFormatter()
    # proxies = {"http": "http://127.0.0.1:7890",
    #            "https": "https://127.0.0.1:7890"}

    # Root folders for the stored subtitle files and for their mirror copy.
    # TODO: these are the local test locations that the original code
    # hard-coded over the deployment values "/mnt/youtube_mysql/srt/main" and
    # "/mnt/youtube_mysql/srt/tmp"; switch the defaults (or pass the paths to
    # downloadOne) before deploying.
    defaultMainPath = "E:/code/tmp/main"
    defaultTmpPath = "E:/code/tmp/tmp"

    @staticmethod
    def _formatSrtTime(subRipTime):
        """Render a pysrt timestamp as text without trailing fractional zeros."""
        text = str(subRipTime.to_time())
        # str(datetime.time) only contains a fractional part when the
        # microseconds are non-zero. Stripping zeros unconditionally would
        # corrupt whole-second values ("00:00:10" -> "00:00:1").
        if "." in text:
            text = text.rstrip("0")
        return text

    @staticmethod
    def _ensureDir(path):
        """Create ``path`` (including parents) when it does not exist yet."""
        if not os.path.exists(path):
            Logger.info("开始创建文件夹:" + path)
            # exist_ok guards against another process creating it meanwhile.
            os.makedirs(path, exist_ok=True)

    @staticmethod
    def iterateSrt(srtFilePath, videoId, channelId):
        """Store every cue of an .srt file through SrtFileService.

        Args:
            srtFilePath: Path of the .srt file to read.
            videoId: Video the subtitles belong to.
            channelId: Channel the video belongs to.

        Nothing is inserted when SrtFileService already has rows for
        ``videoId``.
        """
        # Skip videos whose subtitles were already recorded.
        if SrtFileService.checkExistsByVideoId(videoId):
            Logger.info("VideoId: {} 已收录", videoId)
            return
        subs = pysrt.open(srtFilePath)
        # Cues are numbered from 1 in file order.
        for ordinal, sub in enumerate(subs, start=1):
            SrtFileService.insertOne(
                videoId=videoId,
                channelId=channelId,
                ordinal=ordinal,
                srtStartTime=DownloadUtil._formatSrtTime(sub.start),
                srtEndTime=DownloadUtil._formatSrtTime(sub.end),
                srtText=sub.text,
                isScan=0)

    @staticmethod
    def downLoadMP3(videoId, storePath):
        """Download the audio-only stream of a video as ``<videoId>.mp3``.

        Args:
            videoId: YouTube video id.
            storePath: Folder the file is written to; created when missing.
        """
        # The original looked up the Video and Channel rows here without ever
        # using them (and through VideoService.getOneByVideoId, a different
        # name from the queryOneByVideoId used in downloadOne); those unused
        # lookups were dropped.
        videoUrl = "https://www.youtube.com/watch?v={}".format(videoId)
        yt = YouTube(videoUrl, on_progress_callback=on_progress)
        ys = yt.streams.get_audio_only()
        DownloadUtil._ensureDir(storePath)
        fileName = "{}.mp3".format(videoId)
        ys.download(output_path=storePath, filename=fileName, mp3=True)

    @staticmethod
    @func_set_timeout(60)
    def downloadOne(videoId, mainPath=None, tmpPath=None):
        """Download the subtitles of one video, falling back to its audio.

        The transcript is written as ``<videoId>.srt`` below ``mainPath``,
        copied below ``tmpPath``, the Video and DownloadInfo rows are updated
        and the cues are stored through ``iterateSrt``. When the error text
        contains "No transcripts" the audio is downloaded instead. The call is
        wrapped with ``func_set_timeout(60)``.

        Args:
            videoId: YouTube video id; a matching Video row and its Channel
                row are looked up through the services.
            mainPath: Root folder for stored subtitles; defaults to
                ``DownloadUtil.defaultMainPath``.
            tmpPath: Root folder for the mirror copy; defaults to
                ``DownloadUtil.defaultTmpPath``.
        """
        video: Video = VideoService.queryOneByVideoId(videoId=videoId)
        channel: Channel = ChannelService.queryOneByChannelId(video.channelId)
        Logger.info("开始下载...{}".format(videoId))

        if mainPath is None:
            mainPath = DownloadUtil.defaultMainPath
        if tmpPath is None:
            tmpPath = DownloadUtil.defaultTmpPath

        # Both trees share the layout <root>/<region>/<channelId>-<channelTitle>.
        storePath = "{}/{}/{}-{}".format(
            mainPath, channel.region, channel.channelId, channel.channelTitle)
        cpPath = "{}/{}/{}-{}".format(
            tmpPath, channel.region, channel.channelId, channel.channelTitle)
        DownloadUtil._ensureDir(storePath)
        DownloadUtil._ensureDir(cpPath)

        storePathSrt = "{}/{}.srt".format(storePath, videoId)
        cpPathSrt = "{}/{}.srt".format(cpPath, videoId)
        # A subtitle file that is already on disk is not downloaded again.
        if os.path.exists(storePathSrt):
            Logger.info("{}已存在", storePathSrt)
            return

        try:
            # Download the transcript, write it as SRT and mirror it.
            videoSrt = YouTubeTranscriptApi.get_transcript(
                videoId, languages=[video.videoLanguage])
            srt_formatted = DownloadUtil.formatter.format_transcript(videoSrt)
            Logger.info("文件地址...{}".format(storePathSrt))
            with open(storePathSrt, 'w', encoding='utf-8') as srt_file:
                srt_file.write(srt_formatted)
            Logger.info("下载完成...{}".format(videoId))
            copyfile(storePathSrt, cpPathSrt)
            # Flag the video as downloaded.
            VideoService.upIsDownloadByVideoId(videoId, 1)
            # Flag the download task (if one exists) as finished.
            downloadInfo: DownloadInfo = DownloadInfoService.getOneByVideoId(
                videoId=videoId)
            if downloadInfo is not None:
                DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1)
            # Parse the .srt FILE that was just written; the original passed
            # the containing folder, which cannot be opened as a subtitle file.
            DownloadUtil.iterateSrt(storePathSrt, videoId, video.channelId)
        except Exception as e:
            Logger.error(e)
            logStr = "Exception...{}".format(e)
            Logger.error(logStr)
            downloadInfo: DownloadInfo = DownloadInfoService.getOneByVideoId(
                videoId=videoId)
            if "No transcripts" in logStr:
                Logger.error("VideoId:{},不存在字幕文件".format(videoId))
                # No subtitles available: download the audio instead.
                DownloadUtil.downLoadMP3(videoId, storePath)
                if downloadInfo is not None:
                    DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1)
            elif downloadInfo is not None:
                # Count the failed attempt. Writing a constant 1 here (as the
                # original did) keeps tryTime from ever growing, so the task
                # would be retried indefinitely.
                DownloadInfoService.updateIsFinishByVideoId(
                    videoId, downloadInfo.tryTime + 1, 0)

46
download_video.py

@ -0,0 +1,46 @@
import time
from LoggerUtils import Logger, initLogger
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen, Request
import json
import Contant
from sqlalchemy import create_engine
from entity.DownloadInfoEntity import DownloadInfo
from service.DownloadInfoService import DownloadInfoService
from common.YoutubeUtils import YouTubeUtil
import operator
import argparse
if __name__ == "__main__":
    # Entry point: read the configuration, set up logging and the database
    # engine, then look up one unfinished download task.
    with open('download_video_config.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Initialise logging from the "log" section.
    Contant.logDir = data['log']['dir']
    Contant.logFileName = data['log']['fileName']
    initLogger(Contant.logDir, Contant.logFileName)

    # Build the MySQL engine from the "mysql" section.
    dbHost = data['mysql']['host']
    dbPort = data['mysql']['port']
    dbUserName = data['mysql']['username']
    dbPassword = data['mysql']['password']
    dbDatabase = data['mysql']['database']
    # The password is deliberately left out of the log line: credentials must
    # not end up in log files.
    Logger.info("尝试连接mysql host:'{}' port:'{}' username:'{}' database:'{}'",
                dbHost, dbPort, dbUserName, dbDatabase)
    Contant.engin = create_engine(
        f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}')
    Logger.info("连接mysql成功")

    # Fetch one task that has not been downloaded yet.
    downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish()
    if downloadInfo is not None:
        Logger.info("开始下载videoId:{}".format(downloadInfo.videoId))
        # TODO download the subtitle file or the audio file
        # Fetch the next task.
        # NOTE(review): this looks like the tail of an intended loop; it must
        # stay a single pass until the download step above is implemented,
        # otherwise the same unfinished task would be returned forever.
        downloadInfo = DownloadInfoService.getOneNoFinish()
    else:
        Logger.info("完成下载")

13
download_video_config.json

@ -0,0 +1,13 @@
{
"mysql": {
"host": "47.108.20.249",
"port": "3306",
"username": "root",
"password": "casino888!",
"database": "youtube"
},
"log": {
"dir": "./logs",
"fileName": "download_video"
}
}

15
entity/DownloadInfoEntity.py

@ -0,0 +1,15 @@
from sqlalchemy import Column, Integer, String, Boolean, create_engine
from sqlalchemy.ext.declarative import declarative_base
# Declarative base the model below registers itself on.
Base = declarative_base()


class DownloadInfo(Base):
    """ORM model mapped to the ``Download_info`` table."""

    __tablename__ = 'Download_info'

    # Auto-incremented primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)
    # Id of the video this row refers to (required).
    videoId = Column(String(255), nullable=False)
    # Integer code for the kind of download (required); the meaning of the
    # individual values is not defined in this module.
    downloadType = Column(Integer, nullable=False)
    # Integer attempt counter (required) -- presumably the number of download
    # tries so far; confirm against the service layer.
    tryTime = Column(Integer, nullable=False)
    # Integer completion flag (required).
    isFinished = Column(Integer, nullable=False)

17
entity/SrtFileEntity.py

@ -0,0 +1,17 @@
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
# Declarative base the model below registers itself on.
Base = declarative_base()


class Srtfile(Base):
    """ORM model mapped to the ``Srtfile`` table (one row per subtitle cue)."""

    __tablename__ = 'Srtfile'

    # Auto-incremented primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)
    # Id of the video the cue belongs to (required).
    videoId = Column(String(255), nullable=False)
    # Id of the channel the video belongs to (required).
    channelId = Column(String(255), nullable=False)
    # Position of the cue within the subtitle file (required).
    ordinal = Column(Integer, nullable=False)
    # Cue start time, stored as text (required).
    srtStartTime = Column(String(255), nullable=False)
    # Cue end time, stored as text like the start time (required).
    srtEndTime = Column(String(255), nullable=False)
    # Cue text (required); consider a larger length if the text can be long.
    srtText = Column(String(255), nullable=False)
    # Integer scan flag; NULL is allowed.
    isScan = Column(Integer, nullable=True)

1
search_video.py

@ -40,7 +40,6 @@ if __name__ == "__main__":
f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}')
Logger.info("连接mysql成功")
YouTubeUtil.getByChannelId("channel.channelId", startTime,endTime)
# 查询出所有Channel
# channels = ChannelService.queryAllChannel()
# Logger.info("Channels length:{}".format(len(channels)))

28
service/DownloadInfoService.py

@ -0,0 +1,28 @@
from entity.DownloadInfoEntity import DownloadInfo
from common.Utils import getSession
from sqlalchemy import update
class DownloadInfoService:
    """Database access for ``DownloadInfo`` rows.

    Every method opens its own session through ``getSession()`` and closes it
    again, also when the statement raises.
    """

    @staticmethod
    def getOneNoFinish():
        """Return one unfinished task that has been tried at most twice.

        Returns:
            A ``DownloadInfo`` with ``isFinished == 0`` and ``tryTime <= 2``,
            or ``None`` when no such row exists.
        """
        session = getSession()
        try:
            return session.query(DownloadInfo).filter(
                DownloadInfo.isFinished == 0, DownloadInfo.tryTime <= 2).first()
        finally:
            session.close()

    @staticmethod
    def getOneByVideoId(videoId):
        """Return the task for ``videoId``, or ``None`` when there is none.

        Raises whatever ``Query.one_or_none`` raises when several rows match.
        """
        session = getSession()
        try:
            return session.query(DownloadInfo).filter(
                DownloadInfo.videoId == videoId).one_or_none()
        finally:
            session.close()

    @staticmethod
    def updateIsFinishByVideoId(videoId, tryTime, isFinish):
        """Set the try counter and the finished flag of the task for ``videoId``.

        Args:
            videoId: Video whose task row(s) are updated.
            tryTime: New value of the ``tryTime`` column.
            isFinish: New value of the ``isFinished`` column.
        """
        session = getSession()
        try:
            # The mapped column is named ``isFinished``; passing ``isFinish``
            # to values() names a column that does not exist on the table.
            updateSql = update(DownloadInfo).where(
                DownloadInfo.videoId == videoId).values(
                    tryTime=tryTime, isFinished=isFinish)
            session.execute(updateSql)
            session.commit()
        finally:
            session.close()

23
service/SrtFileService.py

@ -0,0 +1,23 @@
from entity.SrtFileEntity import Srtfile
from common.Utils import getSession
from sqlalchemy import update
class SrtFileService:
    """Database access for ``Srtfile`` rows.

    Every method opens its own session through ``getSession()`` and closes it
    again, also when the statement raises.
    """

    @staticmethod
    def checkExistsByVideoId(videoid):
        """Return True when at least one ``Srtfile`` row exists for ``videoid``."""
        session = getSession()
        try:
            srtFile: Srtfile = session.query(Srtfile).filter(
                Srtfile.videoId == videoid).first()
        finally:
            session.close()
        return srtFile is not None

    @staticmethod
    def insertOne(videoId, channelId, ordinal, srtStartTime, srtEndTime, srtText, isScan):
        """Insert one subtitle cue and commit it.

        Args:
            videoId: Video the cue belongs to.
            channelId: Channel the video belongs to.
            ordinal: Position of the cue within the subtitle file.
            srtStartTime: Cue start time as text.
            srtEndTime: Cue end time as text.
            srtText: Cue text.
            isScan: Value stored in the ``isScan`` column.
        """
        session = getSession()
        try:
            srtFile: Srtfile = Srtfile(
                videoId=videoId, channelId=channelId, ordinal=ordinal,
                srtStartTime=srtStartTime, srtEndTime=srtEndTime,
                srtText=srtText, isScan=isScan)
            session.add(srtFile)
            session.commit()
        finally:
            session.close()

11
service/VideoService.py

@ -31,6 +31,15 @@ class VideoService:
def getLastVideoByChannelId(channelId):
    """Return the most recently published video of a channel.

    Args:
        channelId: Channel whose videos are searched.

    Returns:
        The ``Video`` with the greatest ``videoPublishTime`` for that channel,
        or ``None`` when the channel has no videos.
    """
    session = getSession()
    try:
        # The query is issued once; the duplicated statement (the pre-change
        # line of the diff next to its replacement) was removed.
        video: Video = session.query(Video).where(
            Video.channelId == channelId).order_by(Video.videoPublishTime.desc()).first()
    finally:
        # Release the session even when the query raises.
        session.close()
    return video
def upIsDownloadByVideoId(videoId, isDownload):
    """Set the ``isDownload`` column of the video(s) with the given id.

    Args:
        videoId: Video whose row is updated.
        isDownload: New value of the ``isDownload`` column.
    """
    session = getSession()
    try:
        updateSql = update(Video).where(
            Video.videoId == videoId).values(isDownload=isDownload)
        session.execute(updateSql)
        session.commit()
    finally:
        # Release the session even when the statement or the commit raises.
        session.close()

Loading…
Cancel
Save