10 changed files with 281 additions and 2 deletions
@ -0,0 +1,122 @@ |
|||
from shutil import copyfile |
|||
from youtube_transcript_api import YouTubeTranscriptApi |
|||
from youtube_transcript_api.formatters import SRTFormatter |
|||
from LoggerUtils import Logger |
|||
import time |
|||
import os |
|||
from func_timeout import func_set_timeout |
|||
import operator |
|||
import pysrt |
|||
from pytubefix import YouTube |
|||
from pytubefix.cli import on_progress |
|||
from entity.VideoEntity import Video |
|||
from entity.ChannelEntity import Channel |
|||
from entity.DownloadInfoEntity import DownloadInfo |
|||
from entity.SrtFileEntity import Srtfile |
|||
from service.VideoService import VideoService |
|||
from service.ChannelService import ChannelService |
|||
from service.DownloadInfoService import DownloadInfoService |
|||
from service.SrtFileService import SrtFileService |
|||
|
|||
|
|||
class DownloadUtil:
    """Download subtitles (or audio as a fallback) for YouTube videos.

    All methods are invoked through the class itself
    (``DownloadUtil.downloadOne(videoId)``), so they are declared static.
    """

    # Shared formatter that renders a fetched transcript as SRT text.
    formatter = SRTFormatter()
    # proxies = {"http": "http://127.0.0.1:7890",
    #            "https": "https://127.0.0.1:7890"}

    @staticmethod
    def _formatSrtTime(srtTime):
        """Return ``srtTime`` as text with trailing fractional zeros removed.

        ``str(datetime.time)`` only contains a fractional part when the
        microsecond field is non-zero. Stripping zeros unconditionally would
        damage whole-second values ("00:00:10" -> "00:00:1"), so zeros are
        only stripped when a fraction is actually present.
        """
        text = str(srtTime.to_time())
        if "." in text:
            text = text.rstrip("0")
        return text

    @staticmethod
    def iterateSrt(srtFilePath, videoId, channelId):
        """Store every cue of an SRT file as one row via ``SrtFileService``.

        Args:
            srtFilePath: Path of the ``.srt`` file to read.
            videoId: Video the subtitles belong to.
            channelId: Channel the video belongs to.

        Nothing is inserted when rows for ``videoId`` already exist.
        """
        # Skip videos whose subtitles have already been recorded.
        if SrtFileService.checkExistsByVideoId(videoId):
            Logger.info("VideoId: {} 已收录".format(videoId))
            return
        subs = pysrt.open(srtFilePath)
        # ordinal is the 1-based position of the cue inside the file.
        for ordinal, sub in enumerate(subs, start=1):
            SrtFileService.insertOne(
                videoId=videoId,
                channelId=channelId,
                ordinal=ordinal,
                srtStartTime=DownloadUtil._formatSrtTime(sub.start),
                srtEndTime=DownloadUtil._formatSrtTime(sub.end),
                srtText=sub.text,
                isScan=0)

    @staticmethod
    def downLoadMP3(videoId, storePath):
        """Download the audio-only stream of a video as ``<videoId>.mp3``.

        Args:
            videoId: YouTube video id.
            storePath: Directory the file is written to; created if missing.
        """
        # The former video/channel lookups were removed: their results were
        # never used and they only added two database round trips.
        videoUrl = "https://www.youtube.com/watch?v={}".format(videoId)
        yt = YouTube(videoUrl, on_progress_callback=on_progress)
        ys = yt.streams.get_audio_only()
        mp3OutPutPath = storePath
        if not os.path.exists(mp3OutPutPath):
            Logger.info("开始创建文件夹:" + mp3OutPutPath)
            os.makedirs(mp3OutPutPath)
        fileName = "{}.mp3".format(videoId)
        ys.download(output_path=mp3OutPutPath, filename=fileName, mp3=True)

    @staticmethod
    @func_set_timeout(60)
    def downloadOne(videoId):
        """Download the subtitles of one video and record the outcome.

        The transcript is written as ``<videoId>.srt`` into the channel's
        main folder, copied into the tmp folder, and its cues are stored via
        ``iterateSrt``. When the failure message contains "No transcripts",
        the audio track is downloaded instead. Any other failure increments
        the try counter of the download task.

        Args:
            videoId: YouTube video id.

        The call is limited to 60 seconds by ``func_set_timeout``.
        """
        video: Video = VideoService.queryOneByVideoId(videoId=videoId)
        channel: Channel = ChannelService.queryOneByChannelId(video.channelId)
        # Start the download.
        Logger.info("开始下载...{}".format(videoId))
        # Configured download roots under /mnt/youtube_mysql.
        mainPath = "/mnt/youtube_mysql/srt/main"
        tmpPath = "/mnt/youtube_mysql/srt/tmp"
        # TODO: remove test code -- these overrides shadow the paths above.
        mainPath = "E:/code/tmp/main"
        tmpPath = "E:/code/tmp/tmp"

        storePath = "{}/{}/{}-{}".format(
            mainPath, channel.region, channel.channelId, channel.channelTitle)
        cpPath = "{}/{}/{}-{}".format(
            tmpPath, channel.region, channel.channelId, channel.channelTitle)
        for folder in (storePath, cpPath):
            if not os.path.exists(folder):
                Logger.info("开始创建文件夹:" + folder)
                os.makedirs(folder)

        # Subtitle file names.
        storePathSrt = "{}/{}.srt".format(storePath, videoId)
        cpPathSrt = "{}/{}.srt".format(cpPath, videoId)

        # Nothing to do when the subtitle file is already on disk.
        if os.path.exists(storePathSrt):
            Logger.info("{}已存在".format(storePathSrt))
            return
        try:
            # Download the subtitles, then copy them to the tmp folder.
            videoSrt = YouTubeTranscriptApi.get_transcript(
                videoId, languages=[video.videoLanguage])
            srt_formatted = DownloadUtil.formatter.format_transcript(videoSrt)
            Logger.info("文件地址...{}".format(storePathSrt))
            with open(storePathSrt, 'w', encoding='utf-8') as srt_file:
                srt_file.write(srt_formatted)
            Logger.info("下载完成...{}".format(videoId))
            copyfile(storePathSrt, cpPathSrt)

            # Mark the video as downloaded.
            VideoService.upIsDownloadByVideoId(videoId, 1)
            # Mark the download task as finished.
            downloadInfo: DownloadInfo = DownloadInfoService.getOneByVideoId(
                videoId=videoId)
            if downloadInfo is not None:
                DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1)
            # Parse the subtitle FILE (not its folder) into database rows.
            DownloadUtil.iterateSrt(storePathSrt, videoId, video.channelId)
        except Exception as e:
            Logger.error(e)
            logStr = "Exception...{}".format(e)
            Logger.error(logStr)
            downloadInfo: DownloadInfo = DownloadInfoService.getOneByVideoId(
                videoId=videoId)
            if "No transcripts" in logStr:
                Logger.error("VideoId:{},不存在字幕文件".format(videoId))
                # No subtitles available: download the audio file instead.
                DownloadUtil.downLoadMP3(videoId, storePath)
                # Mark the download task as finished.
                if downloadInfo is not None:
                    DownloadInfoService.updateIsFinishByVideoId(videoId, 1, 1)
            elif downloadInfo is not None:
                # Increase the try counter so the task is not retried forever.
                nextTryTime = (downloadInfo.tryTime or 0) + 1
                DownloadInfoService.updateIsFinishByVideoId(
                    videoId, nextTryTime, 0)
@ -0,0 +1,46 @@ |
|||
import time |
|||
from LoggerUtils import Logger, initLogger |
|||
from bs4 import BeautifulSoup as bs |
|||
from urllib.request import urlopen, Request |
|||
import json |
|||
import Contant |
|||
from sqlalchemy import create_engine |
|||
from entity.DownloadInfoEntity import DownloadInfo |
|||
from service.DownloadInfoService import DownloadInfoService |
|||
from common.YoutubeUtils import YouTubeUtil |
|||
import operator |
|||
import argparse |
|||
|
|||
|
|||
if __name__ == "__main__":
    # Imported locally so this script block stays self-contained.
    from urllib.parse import quote_plus

    # Read the configuration file.
    with open('download_video_config.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Initialise logging.
    Contant.logDir = data['log']['dir']
    Contant.logFileName = data['log']['fileName']
    initLogger(Contant.logDir, Contant.logFileName)

    # Build the MySQL engine.
    dbHost = data['mysql']['host']
    dbPort = data['mysql']['port']
    dbUserName = data['mysql']['username']
    dbPassword = data['mysql']['password']
    dbDatabase = data['mysql']['database']
    # The password is deliberately left out of the log message.
    Logger.info(
        "尝试连接mysql host:'{}' port:'{}' username:'{}' database:'{}'".format(
            dbHost, dbPort, dbUserName, dbDatabase))
    # Escape the credentials so characters such as '@', ':' or '/' cannot
    # break the connection URL.
    safeUserName = quote_plus(str(dbUserName))
    safePassword = quote_plus(str(dbPassword))
    Contant.engin = create_engine(
        f'mysql+mysqlconnector://{safeUserName}:{safePassword}'
        f'@{dbHost}:{dbPort}/{dbDatabase}')
    Logger.info("连接mysql成功")

    # Fetch one download task that is not finished yet.
    downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish()
    if downloadInfo is not None:
        Logger.info("开始下载videoId:{}".format(downloadInfo.videoId))
        # TODO: download the subtitle file or the audio file.
        # Fetch the next download task.
        downloadInfo: DownloadInfo = DownloadInfoService.getOneNoFinish()
    else:
        Logger.info("完成下载")
@ -0,0 +1,13 @@ |
|||
{ |
|||
"mysql": { |
|||
"host": "47.108.20.249", |
|||
"port": "3306", |
|||
"username": "root", |
|||
"password": "casino888!", |
|||
"database": "youtube" |
|||
}, |
|||
"log": { |
|||
"dir": "./logs", |
|||
"fileName": "download_video" |
|||
} |
|||
} |
@ -0,0 +1,15 @@ |
|||
from sqlalchemy import Column, Integer, String, Boolean, create_engine |
|||
from sqlalchemy.ext.declarative import declarative_base |
|||
|
|||
Base = declarative_base()


class DownloadInfo(Base):
    """ORM mapping for one row of the ``Download_info`` table."""

    __tablename__ = 'Download_info'

    # Auto-incrementing primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)
    # Id of the video this download task refers to.
    videoId = Column(String(255), nullable=False)
    # Integer code for the kind of download -- meaning not visible here.
    downloadType = Column(Integer, nullable=False)
    # Presumably the number of attempts made so far -- confirm with callers.
    tryTime = Column(Integer, nullable=False)
    # Integer flag; presumably 0 = not finished, 1 = finished -- confirm.
    isFinished = Column(Integer, nullable=False)
|||
|
@ -0,0 +1,17 @@ |
|||
from sqlalchemy import Column, Integer, String, create_engine |
|||
from sqlalchemy.ext.declarative import declarative_base |
|||
|
|||
Base = declarative_base()


class Srtfile(Base):
    """ORM mapping for one subtitle cue stored in the ``Srtfile`` table."""

    __tablename__ = 'Srtfile'

    # Auto-incrementing primary key.
    id = Column(Integer, autoincrement=True, primary_key=True)
    # Id of the video the cue belongs to.
    videoId = Column(String(255), nullable=False)
    # Id of the channel the video belongs to.
    channelId = Column(String(255), nullable=False)
    # Position of the cue inside its subtitle file.
    ordinal = Column(Integer, nullable=False)
    # Start time of the cue, stored as text.
    srtStartTime = Column(String(255), nullable=False)
    # End time of the cue, stored as text like the start time.
    srtEndTime = Column(String(255), nullable=False)
    # Cue text; consider a larger length if the text can be long.
    srtText = Column(String(255), nullable=False)
    # Integer scan flag; NULL values are allowed.
    isScan = Column(Integer, nullable=True)
@ -0,0 +1,28 @@ |
|||
from entity.DownloadInfoEntity import DownloadInfo |
|||
from common.Utils import getSession |
|||
from sqlalchemy import update |
|||
|
|||
|
|||
class DownloadInfoService:
    """Database access helpers for ``DownloadInfo`` rows.

    Every method opens its own session through ``getSession()`` and always
    closes it, even when the query raises.
    """

    @staticmethod
    def getOneNoFinish():
        """Return one unfinished task with ``tryTime <= 2``, or ``None``."""
        session = getSession()
        try:
            return session.query(DownloadInfo).filter(
                DownloadInfo.isFinished == 0,
                DownloadInfo.tryTime <= 2).first()
        finally:
            session.close()

    @staticmethod
    def getOneByVideoId(videoId):
        """Return the task for ``videoId``, or ``None`` when there is none.

        ``one_or_none()`` raises if more than one row matches.
        """
        session = getSession()
        try:
            return session.query(DownloadInfo).filter(
                DownloadInfo.videoId == videoId).one_or_none()
        finally:
            session.close()

    @staticmethod
    def updateIsFinishByVideoId(videoId, tryTime, isFinish):
        """Set ``tryTime`` and ``isFinished`` on the rows of ``videoId``.

        Args:
            videoId: Video whose download task is updated.
            tryTime: New value of the ``tryTime`` column.
            isFinish: New value of the ``isFinished`` column.
        """
        session = getSession()
        try:
            # The mapped column is named isFinished; the parameter keeps its
            # original name so existing callers are unaffected.
            updateSql = update(DownloadInfo).where(
                DownloadInfo.videoId == videoId).values(
                    tryTime=tryTime, isFinished=isFinish)
            session.execute(updateSql)
            session.commit()
        finally:
            session.close()
@ -0,0 +1,23 @@ |
|||
from entity.SrtFileEntity import Srtfile |
|||
from common.Utils import getSession |
|||
from sqlalchemy import update |
|||
|
|||
|
|||
class SrtFileService:
    """Database access helpers for ``Srtfile`` rows.

    Every method opens its own session through ``getSession()`` and always
    closes it, even when the query or commit raises.
    """

    @staticmethod
    def checkExistsByVideoId(videoid):
        """Return ``True`` when at least one row exists for ``videoid``."""
        session = getSession()
        try:
            srtFile = session.query(Srtfile).filter(
                Srtfile.videoId == videoid).first()
        finally:
            session.close()
        return srtFile is not None

    @staticmethod
    def insertOne(videoId, channelId, ordinal, srtStartTime, srtEndTime, srtText, isScan):
        """Insert one subtitle cue and commit it.

        Args:
            videoId: Video the cue belongs to.
            channelId: Channel the video belongs to.
            ordinal: Position of the cue inside the subtitle file.
            srtStartTime: Start time of the cue as text.
            srtEndTime: End time of the cue as text.
            srtText: Text of the cue.
            isScan: Integer scan flag stored with the row.
        """
        session = getSession()
        try:
            srtFile = Srtfile(
                videoId=videoId, channelId=channelId, ordinal=ordinal,
                srtStartTime=srtStartTime, srtEndTime=srtEndTime,
                srtText=srtText, isScan=isScan)
            session.add(srtFile)
            session.commit()
        finally:
            session.close()
Loading…
Reference in new issue