class DownloadUtil:
    """Download YouTube subtitles, or audio as a fallback, and record the results.

    Every method is static and is meant to be called on the class itself,
    e.g. ``DownloadUtil.downloadOne(videoId)``.
    """

    # Shared formatter used to render a fetched transcript as SRT text.
    formatter = SRTFormatter()
    # proxies = {"http": "http://127.0.0.1:7890",
    #            "https": "https://127.0.0.1:7890"}

    @staticmethod
    def _formatSrtTime(value):
        """Return ``str(value)`` with trailing zeros of the fraction removed.

        Only the fractional part is trimmed: ``00:00:01.500000`` becomes
        ``00:00:01.5`` while ``00:00:10`` is returned unchanged. Stripping
        zeros unconditionally would turn ``00:00:10`` into ``00:00:1``.
        """
        text = str(value)
        return text.rstrip("0") if "." in text else text

    @staticmethod
    def _ensureDir(path):
        """Create ``path`` (including parents) when it does not exist yet."""
        if not os.path.exists(path):
            Logger.info("开始创建文件夹:" + path)
            # exist_ok guards against another process creating it in between.
            os.makedirs(path, exist_ok=True)

    @staticmethod
    def iterateSrt(srtFilePath, videoId, channelId):
        """Store every cue of an SRT file through ``SrtFileService.insertOne``.

        Args:
            srtFilePath: Path of the ``.srt`` file to read.
            videoId: Video the subtitles belong to.
            channelId: Channel the video belongs to.

        Nothing is inserted when ``SrtFileService.checkExistsByVideoId``
        reports that the video already has stored cues.
        """
        # Skip videos whose subtitles were already imported.
        if SrtFileService.checkExistsByVideoId(videoId):
            Logger.info("VideoId: {} 已收录", videoId)
            return
        subs = pysrt.open(srtFilePath)
        # Ordinals are 1-based and follow the order of cues in the file.
        for ordinal, sub in enumerate(subs, start=1):
            SrtFileService.insertOne(
                videoId=videoId,
                channelId=channelId,
                ordinal=ordinal,
                srtStartTime=DownloadUtil._formatSrtTime(sub.start.to_time()),
                srtEndTime=DownloadUtil._formatSrtTime(sub.end.to_time()),
                srtText=sub.text,
                isScan=0)

    @staticmethod
    def downLoadMP3(videoId, storePath):
        """Download the audio-only stream of a video as ``<videoId>.mp3``.

        Args:
            videoId: YouTube video id.
            storePath: Directory to write into; created when missing.
        """
        videoUrl = "https://www.youtube.com/watch?v={}".format(videoId)
        yt = YouTube(videoUrl, on_progress_callback=on_progress)
        ys = yt.streams.get_audio_only()
        DownloadUtil._ensureDir(storePath)
        fileName = "{}.mp3".format(videoId)
        ys.download(output_path=storePath, filename=fileName, mp3=True)

    @staticmethod
    @func_set_timeout(60)
    def downloadOne(videoId):
        """Download the subtitles of one video and update its bookkeeping rows.

        The transcript is fetched in ``video.videoLanguage``, written as an
        SRT file under the main path, copied to the tmp path, and its cues
        are stored via ``iterateSrt``. When the failure message contains
        "No transcripts", the audio track is downloaded instead. The call is
        wrapped by ``func_set_timeout(60)``.

        Args:
            videoId: YouTube video id to process.
        """
        video: Video = VideoService.queryOneByVideoId(videoId=videoId)
        if video is None:
            Logger.error("VideoId:{} not found, skip download".format(videoId))
            return
        channel: Channel = ChannelService.queryOneByChannelId(video.channelId)
        if channel is None:
            Logger.error("ChannelId:{} not found, skip download".format(
                video.channelId))
            return
        # Start downloading.
        Logger.info("开始下载...{}".format(videoId))
        # Download locations (production): /mnt/youtube_mysql
        mainPath = "/mnt/youtube_mysql/srt/main"
        tmpPath = "/mnt/youtube_mysql/srt/tmp"
        # TODO remove test code
        # NOTE(review): these test paths override the production paths above.
        mainPath = "E:/code/tmp/main"
        tmpPath = "E:/code/tmp/tmp"

        storePath = "{}/{}/{}-{}".format(
            mainPath, channel.region, channel.channelId, channel.channelTitle)
        cpPath = "{}/{}/{}-{}".format(
            tmpPath, channel.region, channel.channelId, channel.channelTitle)
        DownloadUtil._ensureDir(storePath)
        DownloadUtil._ensureDir(cpPath)

        # Subtitle file names.
        storePathSrt = "{}/{}.srt".format(storePath, videoId)
        cpPathSrt = "{}/{}.srt".format(cpPath, videoId)

        # Nothing to do when the subtitle file is already on disk.
        if os.path.exists(storePathSrt):
            Logger.info("{}已存在", storePathSrt)
            return
        try:
            # Fetch the transcript, write it as SRT, then copy it.
            videoSrt = YouTubeTranscriptApi.get_transcript(
                videoId, languages=[video.videoLanguage])
            srt_formatted = DownloadUtil.formatter.format_transcript(videoSrt)
            Logger.info("文件地址...{}".format(storePathSrt))
            with open(storePathSrt, 'w', encoding='utf-8') as srt_file:
                srt_file.write(srt_formatted)
            Logger.info("下载完成...{}".format(videoId))
            copyfile(storePathSrt, cpPathSrt)

            # Mark the video as downloaded.
            VideoService.upIsDownloadByVideoId(videoId, 1)
            # Mark the download task as finished.
            downloadInfo: DownloadInfo = DownloadInfoService.getOneByVideoId(
                videoId=videoId)
            if downloadInfo is not None:
                DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1)
            # Parse the SRT *file*; storePath is only its parent directory.
            DownloadUtil.iterateSrt(storePathSrt, videoId, video.channelId)
        except Exception as e:
            logStr = "Exception...{}".format(e)
            Logger.error(logStr)
            # Record one more attempt so the task's retry counter advances.
            downloadInfo: DownloadInfo = DownloadInfoService.getOneByVideoId(
                videoId=videoId)
            nextTryTime = 1
            if downloadInfo is not None:
                nextTryTime = (downloadInfo.tryTime or 0) + 1
            isFinish = 0
            if "No transcripts" in logStr:
                Logger.error("VideoId:{},不存在字幕文件".format(videoId))
                # No subtitles available: fall back to the audio track.
                try:
                    DownloadUtil.downLoadMP3(videoId, storePath)
                    isFinish = 1
                except Exception as mp3Error:
                    # Keep the task unfinished so the attempt is still counted.
                    Logger.error("VideoId:{} audio download failed...{}".format(
                        videoId, mp3Error))
            # Update the download task.
            if downloadInfo is not None:
                DownloadInfoService.updateIsFinishByVideoId(
                    videoId, nextTryTime, isFinish)
class DownloadInfo(Base):
    """ORM mapping for one row of the ``Download_info`` table (a download task)."""
    __tablename__ = 'Download_info'

    # Surrogate primary key, auto-incremented.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Id of the video this task refers to; tasks are looked up by this value.
    videoId = Column(String(255), nullable=False)
    # Integer code for the kind of download.
    # NOTE(review): the meaning of the codes is not visible here - confirm.
    downloadType = Column(Integer, nullable=False)
    # Attempt counter; DownloadInfoService.getOneNoFinish only returns rows
    # with tryTime <= 2.
    tryTime = Column(Integer, nullable=False)
    # Completion flag; DownloadInfoService.getOneNoFinish treats 0 as pending.
    isFinished = Column(Integer, nullable=False)
class DownloadInfoService:
    """Queries and updates for ``DownloadInfo`` rows.

    Every method opens its own session via ``getSession()`` and always closes
    it, even when the query raises. Methods are static and called on the
    class, e.g. ``DownloadInfoService.getOneNoFinish()``.
    """

    @staticmethod
    def getOneNoFinish():
        """Return one pending task, or ``None`` when there is none.

        A task is pending when ``isFinished == 0`` and ``tryTime <= 2``.
        """
        session = getSession()
        try:
            return session.query(DownloadInfo).filter(
                DownloadInfo.isFinished == 0, DownloadInfo.tryTime <= 2).first()
        finally:
            session.close()

    @staticmethod
    def getOneByVideoId(videoId):
        """Return the task for ``videoId``, or ``None`` when there is none.

        Uses ``one_or_none()``, which raises when more than one row matches.
        """
        session = getSession()
        try:
            return session.query(DownloadInfo).filter(
                DownloadInfo.videoId == videoId).one_or_none()
        finally:
            session.close()

    @staticmethod
    def updateIsFinishByVideoId(videoId, tryTime, isFinish):
        """Set ``tryTime`` and the finished flag on the task(s) of ``videoId``.

        Args:
            videoId: Video whose task rows are updated.
            tryTime: New value for the ``tryTime`` column.
            isFinish: New value for the ``isFinished`` column.
        """
        session = getSession()
        try:
            # The mapped column is ``isFinished``; ``isFinish`` is only the
            # parameter name and is not a column of DownloadInfo.
            updateSql = update(DownloadInfo).where(
                DownloadInfo.videoId == videoId).values(
                    tryTime=tryTime, isFinished=isFinish)
            session.execute(updateSql)
            session.commit()
        finally:
            session.close()