10 changed files with 281 additions and 2 deletions
@ -0,0 +1,122 @@ |
|||||
|
from shutil import copyfile |
||||
|
from youtube_transcript_api import YouTubeTranscriptApi |
||||
|
from youtube_transcript_api.formatters import SRTFormatter |
||||
|
from LoggerUtils import Logger |
||||
|
import time |
||||
|
import os |
||||
|
from func_timeout import func_set_timeout |
||||
|
import operator |
||||
|
import pysrt |
||||
|
from pytubefix import YouTube |
||||
|
from pytubefix.cli import on_progress |
||||
|
from entity.VideoEntity import Video |
||||
|
from entity.ChannelEntity import Channel |
||||
|
from entity.DownloadInfoEntity import DownloadInfo |
||||
|
from entity.SrtFileEntity import Srtfile |
||||
|
from service.VideoService import VideoService |
||||
|
from service.ChannelService import ChannelService |
||||
|
from service.DownloadInfoService import DownloadInfoService |
||||
|
from service.SrtFileService import SrtFileService |
||||
|
|
||||
|
|
||||
|
class DownloadUtil:
    """Download YouTube subtitles (or audio as a fallback) and record them.

    All methods are static and are called through the class, e.g.
    ``DownloadUtil.downloadOne(videoId)``.
    """

    # Shared formatter that renders a fetched transcript as SRT text.
    formatter = SRTFormatter()
    # proxies = {"http": "http://127.0.0.1:7890",
    #            "https": "https://127.0.0.1:7890"}

    # Storage roots used when the caller does not pass its own.
    DEFAULT_MAIN_PATH = "/mnt/youtube_mysql/srt/main"
    DEFAULT_TMP_PATH = "/mnt/youtube_mysql/srt/tmp"

    @staticmethod
    def _formatSrtTime(srtTime):
        """Return ``srtTime`` as text with trailing fractional zeros removed.

        ``str(time)`` only contains a fractional part when the microseconds
        are non-zero.  Zeros are stripped only in that case, so a whole-second
        value such as ``00:00:10`` is not truncated to ``00:00:1``.
        """
        text = str(srtTime.to_time())
        return text.rstrip("0") if "." in text else text

    @staticmethod
    def _ensureDir(path):
        """Create ``path`` (including parents) when it does not exist yet."""
        if not os.path.exists(path):
            Logger.info("开始创建文件夹:" + path)
            # exist_ok guards against another process creating it in between.
            os.makedirs(path, exist_ok=True)

    @staticmethod
    def _recordAttempt(videoId, downloadInfo, isFinish):
        """Store one more attempt and the finished flag for a download task.

        Does nothing when ``downloadInfo`` is None (no task row for the video).
        """
        if downloadInfo is None:
            return
        attempts = (downloadInfo.tryTime or 0) + 1
        DownloadInfoService.updateIsFinishByVideoId(videoId, attempts, isFinish)

    @staticmethod
    def iterateSrt(srtFilePath, videoId, channelId):
        """Insert every cue of an SRT file as one row via SrtFileService.

        Args:
            srtFilePath: Path of the ``.srt`` file to read.
            videoId: Video the subtitles belong to.
            channelId: Channel the video belongs to.

        Nothing is inserted when rows for ``videoId`` already exist.
        """
        # Skip videos whose subtitles were already stored.
        if SrtFileService.checkExistsByVideoId(videoId):
            Logger.info("VideoId: {} 已收录".format(videoId))
            return

        subs = pysrt.open(srtFilePath)
        # ordinal is the 1-based position of the cue inside the file.
        for ordinal, sub in enumerate(subs, start=1):
            SrtFileService.insertOne(
                videoId=videoId,
                channelId=channelId,
                ordinal=ordinal,
                srtStartTime=DownloadUtil._formatSrtTime(sub.start),
                srtEndTime=DownloadUtil._formatSrtTime(sub.end),
                srtText=sub.text,
                isScan=0,
            )

    @staticmethod
    def downLoadMP3(videoId, storePath):
        """Download the audio-only stream of a video as ``<videoId>.mp3``.

        Args:
            videoId: YouTube video id.
            storePath: Directory the file is written to; created if missing.
        """
        videoUrl = "https://www.youtube.com/watch?v={}".format(videoId)
        yt = YouTube(videoUrl, on_progress_callback=on_progress)
        ys = yt.streams.get_audio_only()

        DownloadUtil._ensureDir(storePath)
        fileName = "{}.mp3".format(videoId)
        ys.download(output_path=storePath, filename=fileName, mp3=True)

    @staticmethod
    @func_set_timeout(60)
    def downloadOne(videoId, mainPath=DEFAULT_MAIN_PATH, tmpPath=DEFAULT_TMP_PATH):
        """Download the subtitles of one video and update its bookkeeping rows.

        The transcript is written as ``<videoId>.srt`` below ``mainPath`` and
        copied below ``tmpPath``, both inside a
        ``<region>/<channelId>-<channelTitle>`` directory.  When the error
        text says no transcript exists, the audio track is downloaded instead.

        Args:
            videoId: YouTube video id.
            mainPath: Root directory for the stored subtitle files.
            tmpPath: Root directory for the copies of the subtitle files.
                Callers that need another location (for example a local test
                directory) pass these two explicitly.

        The call is limited to 60 seconds by ``func_set_timeout``.
        """
        video: Video = VideoService.queryOneByVideoId(videoId=videoId)
        channel: Channel = ChannelService.queryOneByChannelId(video.channelId)
        Logger.info("开始下载...{}".format(videoId))

        storePath = "{}/{}/{}-{}".format(
            mainPath, channel.region, channel.channelId, channel.channelTitle)
        cpPath = "{}/{}/{}-{}".format(
            tmpPath, channel.region, channel.channelId, channel.channelTitle)
        DownloadUtil._ensureDir(storePath)
        DownloadUtil._ensureDir(cpPath)

        # Subtitle file names.
        storePathSrt = "{}/{}.srt".format(storePath, videoId)
        cpPathSrt = "{}/{}.srt".format(cpPath, videoId)

        # Nothing to do when the subtitle file is already on disk.
        if os.path.exists(storePathSrt):
            Logger.info("{}已存在".format(storePathSrt))
            return

        try:
            # Download the subtitles, write them to disk and copy the file.
            videoSrt = YouTubeTranscriptApi.get_transcript(
                videoId, languages=[video.videoLanguage])
            srt_formatted = DownloadUtil.formatter.format_transcript(videoSrt)
            Logger.info("文件地址...{}".format(storePathSrt))
            with open(storePathSrt, 'w', encoding='utf-8') as srt_file:
                srt_file.write(srt_formatted)
            Logger.info("下载完成...{}".format(videoId))
            copyfile(storePathSrt, cpPathSrt)

            # Flag the video as downloaded.
            VideoService.upIsDownloadByVideoId(videoId, 1)
            # Mark the download task as finished.
            downloadInfo: DownloadInfo = DownloadInfoService.getOneByVideoId(
                videoId=videoId)
            DownloadUtil._recordAttempt(videoId, downloadInfo, 1)
            # Parse the subtitle FILE (not its directory) into database rows.
            DownloadUtil.iterateSrt(storePathSrt, videoId, video.channelId)
        except Exception as e:
            Logger.error(e)
            logStr = "Exception...{}".format(e)
            Logger.error(logStr)
            downloadInfo: DownloadInfo = DownloadInfoService.getOneByVideoId(
                videoId=videoId)
            if "No transcripts" in logStr:
                Logger.error("VideoId:{},不存在字幕文件".format(videoId))
                # No subtitles available: fall back to the audio track.
                DownloadUtil.downLoadMP3(videoId, storePath)
                DownloadUtil._recordAttempt(videoId, downloadInfo, 1)
            else:
                # Count the failed attempt so the task's try counter grows.
                DownloadUtil._recordAttempt(videoId, downloadInfo, 0)
@ -0,0 +1,46 @@ |
|||||
|
import time |
||||
|
from LoggerUtils import Logger, initLogger |
||||
|
from bs4 import BeautifulSoup as bs |
||||
|
from urllib.request import urlopen, Request |
||||
|
import json |
||||
|
import Contant |
||||
|
from sqlalchemy import create_engine |
||||
|
from entity.DownloadInfoEntity import DownloadInfo |
||||
|
from service.DownloadInfoService import DownloadInfoService |
||||
|
from common.YoutubeUtils import YouTubeUtil |
||||
|
import operator |
||||
|
import argparse |
||||
|
|
||||
|
|
||||
|
if __name__ == "__main__":
    # Read the configuration file.
    with open('download_video_config.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Initialise logging from the "log" section of the configuration.
    Contant.logDir = data['log']['dir']
    Contant.logFileName = data['log']['fileName']
    initLogger(Contant.logDir, Contant.logFileName)

    # Connect to MySQL using the "mysql" section of the configuration.
    dbHost = data['mysql']['host']
    dbPort = data['mysql']['port']
    dbUserName = data['mysql']['username']
    dbPassword = data['mysql']['password']
    dbDatabase = data['mysql']['database']
    # The password is deliberately left out of the log message.
    Logger.info("尝试连接mysql host:'{}' port:'{}' username:'{}' database:'{}'".format(
        dbHost, dbPort, dbUserName, dbDatabase))
    Contant.engin = create_engine(
        f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}')
    Logger.info("连接mysql成功")

    # Fetch one download task that is not finished yet.
    downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish()
    if downloadInfo is not None:
        Logger.info("开始下载videoId:{}".format(downloadInfo.videoId))
        # TODO: download the subtitle file or the audio file.
        # Fetch the next download task.
        downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish()
    else:
        Logger.info("完成下载")
@ -0,0 +1,13 @@ |
|||||
|
{ |
||||
|
"mysql": { |
||||
|
"host": "47.108.20.249", |
||||
|
"port": "3306", |
||||
|
"username": "root", |
||||
|
"password": "casino888!", |
||||
|
"database": "youtube" |
||||
|
}, |
||||
|
"log": { |
||||
|
"dir": "./logs", |
||||
|
"fileName": "download_video" |
||||
|
} |
||||
|
} |
@ -0,0 +1,15 @@ |
|||||
|
from sqlalchemy import Column, Integer, String, Boolean, create_engine |
||||
|
from sqlalchemy.ext.declarative import declarative_base |
||||
|
|
||||
|
Base = declarative_base() |
||||
|
|
||||
|
|
||||
|
class DownloadInfo(Base):
    """ORM mapping of one row of the ``Download_info`` table."""

    __tablename__ = 'Download_info'

    # Auto-incremented surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Id of the video this download task refers to.
    videoId = Column(String(255), nullable=False)
    # Integer code for the kind of download (values are not defined here).
    downloadType = Column(Integer, nullable=False)
    # Attempt counter of the task.
    tryTime = Column(Integer, nullable=False)
    # Integer finished flag of the task.
    isFinished = Column(Integer, nullable=False)
||||
|
|
@ -0,0 +1,17 @@ |
|||||
|
from sqlalchemy import Column, Integer, String, create_engine |
||||
|
from sqlalchemy.ext.declarative import declarative_base |
||||
|
|
||||
|
Base = declarative_base() |
||||
|
|
||||
|
|
||||
|
class Srtfile(Base):
    """ORM mapping of one subtitle cue stored in the ``Srtfile`` table."""

    __tablename__ = 'Srtfile'

    # Auto-incremented surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Video and channel the cue belongs to.
    videoId = Column(String(255), nullable=False)
    channelId = Column(String(255), nullable=False)
    # Sequence number of the cue.
    ordinal = Column(Integer, nullable=False)
    # Start and end time of the cue, stored as text.
    srtStartTime = Column(String(255), nullable=False)
    srtEndTime = Column(String(255), nullable=False)
    # Cue text; consider a larger length if the text can be long.
    srtText = Column(String(255), nullable=False)
    # Integer scan flag; NULL is allowed.
    isScan = Column(Integer, nullable=True)
@ -0,0 +1,28 @@ |
|||||
|
from entity.DownloadInfoEntity import DownloadInfo |
||||
|
from common.Utils import getSession |
||||
|
from sqlalchemy import update |
||||
|
|
||||
|
|
||||
|
class DownloadInfoService:
    """Query and update helpers for ``DownloadInfo`` rows.

    Every method opens its own session through ``getSession()`` and closes it
    in a ``finally`` block, so the session is released even when the database
    call raises.
    """

    @staticmethod
    def getOneNoFinish():
        """Return one task with ``isFinished == 0`` and ``tryTime <= 2``.

        Returns:
            The first matching ``DownloadInfo``, or None when there is none.
        """
        session = getSession()
        try:
            return session.query(DownloadInfo).filter(
                DownloadInfo.isFinished == 0, DownloadInfo.tryTime <= 2).first()
        finally:
            session.close()

    @staticmethod
    def getOneByVideoId(videoId):
        """Return the task for ``videoId``, or None when there is none.

        Uses ``one_or_none()``, which raises when more than one row matches.
        """
        session = getSession()
        try:
            return session.query(DownloadInfo).filter(
                DownloadInfo.videoId == videoId).one_or_none()
        finally:
            session.close()

    @staticmethod
    def updateIsFinishByVideoId(videoId, tryTime, isFinish):
        """Set the try counter and finished flag of the task for ``videoId``.

        Args:
            videoId: Video whose task row is updated.
            tryTime: New value of the ``tryTime`` column.
            isFinish: New value of the ``isFinished`` column.
        """
        session = getSession()
        try:
            # The mapped attribute is ``isFinished``; ``values()`` must use
            # that name even though the parameter is called ``isFinish``.
            updateSql = update(DownloadInfo).where(
                DownloadInfo.videoId == videoId).values(
                    tryTime=tryTime, isFinished=isFinish)
            session.execute(updateSql)
            session.commit()
        finally:
            session.close()
@ -0,0 +1,23 @@ |
|||||
|
from entity.SrtFileEntity import Srtfile |
||||
|
from common.Utils import getSession |
||||
|
from sqlalchemy import update |
||||
|
|
||||
|
|
||||
|
class SrtFileService:
    """Query and insert helpers for ``Srtfile`` rows.

    Every method opens its own session through ``getSession()`` and closes it
    in a ``finally`` block, so the session is released even when the database
    call raises.
    """

    @staticmethod
    def checkExistsByVideoId(videoid):
        """Return True when at least one ``Srtfile`` row exists for the video.

        Args:
            videoid: Video id to look up.
        """
        session = getSession()
        try:
            firstRow = session.query(Srtfile).filter(
                Srtfile.videoId == videoid).first()
        finally:
            session.close()
        return firstRow is not None

    @staticmethod
    def insertOne(videoId, channelId, ordinal, srtStartTime, srtEndTime, srtText, isScan):
        """Insert one ``Srtfile`` row and commit it.

        Args:
            videoId: Video the cue belongs to.
            channelId: Channel the video belongs to.
            ordinal: Sequence number of the cue.
            srtStartTime: Start time of the cue as text.
            srtEndTime: End time of the cue as text.
            srtText: Text of the cue.
            isScan: Value of the ``isScan`` column.
        """
        session = getSession()
        try:
            session.add(Srtfile(
                videoId=videoId,
                channelId=channelId,
                ordinal=ordinal,
                srtStartTime=srtStartTime,
                srtEndTime=srtEndTime,
                srtText=srtText,
                isScan=isScan,
            ))
            session.commit()
        finally:
            session.close()
Loading…
Reference in new issue