class DownloadService:
    """Query and update helpers for rows of the ``DownloadInfo`` table.

    All methods are invoked on the class itself
    (``DownloadService.createOne(...)``), so they are declared static; this
    also keeps them usable if someone calls them on an instance.
    """

    @staticmethod
    def getOneByVideoId(videoId, downloadType):
        """Return the row for ``videoId`` / ``downloadType`` or ``None``.

        ``get_or_none`` is used instead of ``get``: the download routines test
        the result with ``is not None``, whereas ``get`` raises
        ``DoesNotExist`` when no row matches.
        """
        return DownloadInfo.get_or_none(
            DownloadInfo.videoId == videoId,
            DownloadInfo.downloadType == downloadType,
        )

    @staticmethod
    def createOne(videoId, downloadType, tryTime, isFinished):
        """Insert a new download-tracking row."""
        DownloadInfo.create(
            videoId=videoId,
            downloadType=downloadType,
            tryTime=tryTime,
            isFinished=isFinished,
        )

    @staticmethod
    def updateInfoByVideoId(videoId, tryTime, isFinished, downloadType):
        """Set ``tryTime`` and ``isFinished`` on the matching row."""
        DownloadInfo.update(tryTime=tryTime, isFinished=isFinished).where(
            DownloadInfo.videoId == videoId,
            DownloadInfo.downloadType == downloadType,
        ).execute()

    @staticmethod
    def changeDownloadType(videoId, tryTime, isFinished, downloadType, changeType):
        """Move a row from ``downloadType`` to ``changeType`` and reset its counters."""
        DownloadInfo.update(
            tryTime=tryTime, isFinished=isFinished, downloadType=changeType
        ).where(
            DownloadInfo.videoId == videoId,
            DownloadInfo.downloadType == downloadType,
        ).execute()

    @staticmethod
    def _findPending(downloadType):
        """Return up to 10 unfinished rows of ``downloadType`` tried at most 5 times."""
        return DownloadInfo.select().where(
            DownloadInfo.isFinished == 0,
            DownloadInfo.tryTime <= 5,
            DownloadInfo.downloadType == downloadType,
        ).limit(10).execute()

    @staticmethod
    def findNotFinishList():
        """Return the next batch of pending rows with download type 1."""
        return DownloadService._findPending(1)

    @staticmethod
    def findNotFinishListTwo():
        """Return the next batch of pending rows with download type 2."""
        return DownloadService._findPending(2)
Logger.info("开始下载...{}".format(videoId)) + cpPath = "" + try: + # 获取字幕 + languages = str(video.videoLanguage) + storePath = "/mnt/srt_file/" + str(channel.channelTitle) + cpPath = "/mnt/tmp_srt_file/" + str(channel.channelTitle) + if not os.path.exists(storePath): + Logger.info("开始创建文件夹:" + storePath) + os.makedirs(storePath) + if not os.path.exists(cpPath): + Logger.info("开始创建文件夹:" + cpPath) + os.makedirs(cpPath) + storePath = storePath + "/" + videoPublishTime + \ + "-" + languages + "-" + videoTitle + ".srt" + cpPath = cpPath + "/" + videoPublishTime + \ + "-" + languages + "-" + videoTitle + ".srt" + if len(cpPath) > 120: + storePath = storePath[:-20] + ".srt" + cpPath = cpPath[:-20] + ".srt" + videoSrt = YouTubeTranscriptApi.get_transcript( + videoId, languages=[languages]) + srt_formatted = DownLoadUtil.formatter.format_transcript(videoSrt) + Logger.info("文件地址...{}".format(storePath)) + with open(storePath, 'w', encoding='utf-8') as srt_file: + srt_file.write(srt_formatted) + Logger.info("下载完成...{}".format(videoId)) + copyfile(storePath, cpPath) + # 修改video数据 + VideoService.updateIsDownloadByVideoId(videoId, 1) + # 修改downloadInfo + downloadInfo = DownloadService.getOneByVideoId(videoId, 1) + if downloadInfo is not None: + DownloadService.updateInfoByVideoId( + videoId, downloadInfo.tryTime + 1, 1, 1) + except Exception as e: + Logger.error("下载失败...{}".format(videoId)) + logStr = "Exception...{}".format(e) + Logger.error(logStr) + downloadInfo = DownloadService.getOneByVideoId(videoId, 1) + if operator.contains(logStr, "No transcripts"): + Logger.error("VideoId:{},不存在字幕文件".format(videoId)) + if downloadInfo is not None: + DownloadService.changeDownloadType( + videoId, 0, 0, 1, 2) + elif operator.contains(logStr, "File name too long"): + # 文件名过长 + languages = str(video.videoLanguage) + videoSrt = YouTubeTranscriptApi.get_transcript( + videoId, languages=[languages]) + srt_formatted = DownLoadUtil.formatter.format_transcript(videoSrt) + storePath = 
@func_set_timeout(60)
def downloadTwo(videoId):
    """Download the subtitles of one video into ``./download/<channel title>/``.

    Second-stage download (download type 2). On success the video is marked
    as downloaded and its ``DownloadInfo`` row as finished. When no
    transcript exists the row is moved to download type 3; any other failure
    only increments the retry counter.

    Fixes over the previous version:
    * ``VideoService.getOneByVideoId`` takes a single argument; passing a
      second one raised ``TypeError`` on every call.
    * The target path is built with ``os.path.join`` instead of a literal
      backslash, which on POSIX became part of the file name.
    * A missing video row is reported instead of raising ``AttributeError``.
    """
    # Load the video and its channel.
    video = VideoService.getOneByVideoId(videoId)
    if video is None:
        Logger.error("Video row not found, skip download: {}".format(videoId))
        return
    channel = ChannelService.getOneByChannelId(str(video.channelId))

    # Make the title usable as a file name.
    videoTitle = str(video.videoTitle).replace("/", u"\u2215")
    # NOTE(review): as written this replaces "?" with itself; presumably a
    # full-width question mark was intended as the replacement - confirm.
    videoTitle = videoTitle.replace("?", "?")
    for forbidden in ("\\", "|", "<", ">", ":"):
        videoTitle = videoTitle.replace(forbidden, "")

    # Keep only the date part of the publish timestamp.
    videoPublishTime = str(video.videoPublishTime).split("T")[0]

    Logger.info("开始下载...{}".format(videoId))
    try:
        languages = str(video.videoLanguage)
        storeDir = "./download/" + str(channel.channelTitle)
        if not os.path.exists(storeDir):
            Logger.info("开始创建文件夹:" + storeDir)
            os.makedirs(storeDir)
        storePath = os.path.join(
            storeDir,
            videoPublishTime + "-" + languages + "-" + videoTitle + ".srt",
        )

        # Fetch the transcript and render it as SRT.
        videoSrt = YouTubeTranscriptApi.get_transcript(
            videoId, languages=[languages])
        srt_formatted = DownLoadUtil.formatter.format_transcript(videoSrt)
        Logger.info("文件地址...{}".format(storePath))
        with open(storePath, 'w', encoding='utf-8') as srt_file:
            srt_file.write(srt_formatted)
        Logger.info("下载完成...{}".format(videoId))

        # Mark the video as downloaded and the task as finished.
        VideoService.updateIsDownloadByVideoId(videoId, 1)
        downloadInfo = DownloadService.getOneByVideoId(videoId, 2)
        if downloadInfo is not None:
            DownloadService.updateInfoByVideoId(
                videoId, downloadInfo.tryTime + 1, 1, 2)
    except Exception as e:
        Logger.error("下载失败...{}".format(videoId))
        logStr = "Exception...{}".format(e)
        Logger.error(logStr)
        downloadInfo = DownloadService.getOneByVideoId(videoId, 2)
        if "No transcripts" in logStr:
            # No subtitles exist: hand the task over to download type 3.
            Logger.error("VideoId:{},不存在字幕文件".format(videoId))
            if downloadInfo is not None:
                DownloadService.changeDownloadType(
                    videoId, 6, 0, 2, 3)
        else:
            # Any other failure: count the attempt so it can be retried.
            if downloadInfo is not None:
                Logger.info("VideoId:{}开始重试第{}次".format(
                    videoId, downloadInfo.tryTime + 1))
                DownloadService.updateInfoByVideoId(
                    videoId, downloadInfo.tryTime + 1, 0, 2)
+parser.add_argument('--logDir', type=str, default='') +args = parser.parse_args() +Contant.db = args.db +db = SqliteDatabase(Contant.db) + + +def ormInit(): + Channel.create_table() + Video.create_table() + DownloadInfo.create_table() + + +class BaseModel(Model): + class Meta: + database = db + +# 频道信息 + + +class Channel(BaseModel): + id = PrimaryKeyField() + channelId = CharField(null=False) + channelTitle = CharField(null=False) + channelLanguage = CharField() + channelReptileTime = CharField(null=True) + + class Meta: + db_table = 'Channel' + +# 视频信息 + + +class Video(BaseModel): + id = PrimaryKeyField() + videoId = CharField(null=False) + channelId = CharField(null=False) + videoTitle = CharField() + videoLen = IntegerField() + videoType = CharField() + videoPublishTime = CharField() + videoLanguage = CharField() + isDownload = IntegerField() + + class Meta: + db_table = 'Vidoes' + +# 下载信息 + + +class DownloadInfo(BaseModel): + id = PrimaryKeyField() + videoId = CharField() + downloadType = IntegerField() + tryTime = IntegerField() + isFinished = IntegerField() + + class Meta: + db_table = 'Download_info' diff --git a/download/VideoService.py b/download/VideoService.py new file mode 100644 index 0000000..e3fdaef --- /dev/null +++ b/download/VideoService.py @@ -0,0 +1,26 @@ +import json +from Orm import Video +from playhouse.shortcuts import model_to_dict, dict_to_model + + +class VideoService: + def getOneByVideoId(videoId): + return Video.get_or_none(Video.videoId == videoId) + + def createOne(videoId, channelId, videoTitle, videoLen, videoType, videoPublishTime, videoLanguage, isDownload): + Video.create(videoId=videoId, + channelId=channelId, + videoTitle=videoTitle, + videoLen=videoLen, + videoType=videoType, + videoPublishTime=videoPublishTime, + videoLanguage=videoLanguage, + isDownload=isDownload) + + def updateLenByVideoId(videoId, len): + Video.update(videoLen=len).where(Video.videoId == videoId).execute() + + def updateIsDownloadByVideoId(videoId, 
# python3 ./main_download.py --db="../db/youtube_prod.db" --logDir="./logs"
if __name__ == "__main__":
    import os

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--db', type=str, default='')
    parser.add_argument('--logDir', type=str, default='')
    args = parser.parse_args()
    Contant.db = args.db
    Contant.logDir = args.logDir
    initLogger()
    Orm.ormInit()

    # Poll pending download tasks in batches until none are left.
    pending = DownloadService.findNotFinishList()
    Logger.info("list size:{}".format(len(pending)))
    while len(pending) > 0:
        for info in pending:
            try:
                DownLoadUtil.downloadOne(info.videoId)
                restTime = random.randint(1, 3)
                Logger.info("间隔{}秒后继续...".format(restTime))
                time.sleep(restTime)
            except func_timeout.exceptions.FunctionTimedOut as e:
                Logger.error("执行下载方法超时错误:{}".format(e))
                # A timed-out download never reaches the bookkeeping inside
                # downloadOne. Count the attempt here, otherwise the same row
                # is selected again on every round and the loop never ends.
                DownloadService.updateInfoByVideoId(
                    info.videoId, info.tryTime + 1, 0, 1)
        loopRestTime = random.randint(1, 3)
        Logger.info("循环间隔{}秒后继续...".format(loopRestTime))
        time.sleep(loopRestTime)
        pending = DownloadService.findNotFinishList()

    # Send the DingTalk notification.
    # SECURITY NOTE(review): the access token is committed in source; set
    # DINGTALK_WEBHOOK in the environment and rotate the embedded token.
    webhook = os.environ.get(
        "DINGTALK_WEBHOOK",
        "https://oapi.dingtalk.com/robot/send?access_token=c8c8d7d42c4eecd449dd303025ef968f647d1d8e8694e3fabc0ab5770d646dcb",
    )
    jsonData = {
        "msgtype": "text",
        "text": {
            "content": "[Youtube]download finished"
        }
    }
    try:
        # A timeout keeps a hung webhook from blocking the job forever.
        requests.post(webhook, json=jsonData, timeout=10)
        Logger.info("download发送钉钉消息成功...")
    except requests.RequestException as e:
        Logger.error("DingTalk notification failed: {}".format(e))
def saveChannel(channelUrl, language):
    """Look up a channel page and store the channel if it is not known yet.

    The page at ``channelUrl`` is fetched and its ``<link>`` tags are scanned:
    the tag containing "canonical" supplies the channel id (the part of its
    ``href`` after ``/channel/``) and a tag containing ``content=`` supplies
    the channel name.

    Args:
        channelUrl: URL of the channel page.
        language: value stored in the ``channelLanguage`` column.

    Nothing is stored when no channel id can be extracted or when a row with
    that id already exists.
    """
    Logger.info("频道链接:"+channelUrl)
    channelId = ""
    channelName = ""
    pageRequest = Request(channelUrl, headers={'User-Agent': 'Mozilla'})
    # Close the HTTP response once the document has been parsed.
    with urlopen(pageRequest) as url_opener:
        videoInfo = bs(url_opener, features="html.parser")
    for link in videoInfo.find_all("link"):
        markup = str(link)
        if "canonical" in markup:
            # The canonical URL may not contain "/channel/"; leave the id
            # empty in that case instead of raising IndexError.
            hrefParts = str(link.get('href', '')).split("/channel/")
            if len(hrefParts) > 1:
                channelId = hrefParts[1]
        if "content=" in markup:
            channelName = str(link.get('content', ''))
    Logger.info("channelId:"+channelId)
    Logger.info("channelName:"+channelName)
    if not channelId:
        # An empty id would create a useless row that blocks later inserts.
        Logger.error("channelId not found, skip: " + channelUrl)
        return
    channel = Channel.get_or_none(Channel.channelId == channelId)
    if channel is not None:
        Logger.info("频道已存在:" + channelId)
        return
    Channel.create(channelTitle=channelName,
                   channelId=channelId, channelLanguage=language)
diff --git a/init/urlList.txt b/init/urlList.txt new file mode 100644 index 0000000..9f8bbfa --- /dev/null +++ b/init/urlList.txt @@ -0,0 +1,14 @@ +zh-TW +https://www.youtube.com/@TheStormMedia +https://www.youtube.com/@57ETFN +https://www.youtube.com/@MoneyNewWorld +https://www.youtube.com/@tvbsmoney +https://www.youtube.com/@TheMasterhsiao +https://www.youtube.com/@mvp5888 +https://www.youtube.com/@HUNG64 +https://www.youtube.com/@user-vc2vr6tw4h +https://www.youtube.com/ustv +https://www.youtube.com/@leon888 +https://www.youtube.com/@smartmonthly-BW +https://www.youtube.com/@ustvstockonline +https://www.youtube.com/@AASTOCKS_AATV \ No newline at end of file diff --git a/init/urlList_hi.txt b/init/urlList_hi.txt new file mode 100644 index 0000000..7cd2ab9 --- /dev/null +++ b/init/urlList_hi.txt @@ -0,0 +1,90 @@ +hi +https://www.youtube.com/@procapitalacademy +https://www.youtube.com/@TEACHERANISH +https://www.youtube.com/@MarketGurukul1 + + + +en +https://www.youtube.com/@VishalKhandelwalshow +https://www.youtube.com/@Elearnmarkets +https://www.youtube.com/@MarketsMojo +https://www.youtube.com/@TradeWithTrend +https://www.youtube.com/@SHAREKHAN +https://www.youtube.com/@AvadhutSatheTradingAcademy + + + +ko +https://www.youtube.com/@E_TREND +https://www.youtube.com/@hkwownet +https://www.youtube.com/@giant_tv +https://www.youtube.com/@StrongStock +https://www.youtube.com/@stockwar999 +https://www.youtube.com/@user-sp1du8pm6q +https://www.youtube.com/@talentinvestment +https://www.youtube.com/@future_economy +https://www.youtube.com/@user-sf7hm6xj8d +https://www.youtube.com/@user-xv9xi6pi9o +https://www.youtube.com/@user-rd8fd1xj9b +https://www.youtube.com/@lucky_tv +https://www.youtube.com/@Min_woo +https://www.youtube.com/@taver1123 +https://www.youtube.com/@Super0Min +https://www.youtube.com/@ap5798 +https://www.youtube.com/@drematree100 +https://www.youtube.com/@MKeconomy_TV +https://www.youtube.com/@grit +https://www.youtube.com/@user-zn9js9fg5i 
+https://www.youtube.com/@youngikkim +https://www.youtube.com/@DonNawa +https://www.youtube.com/@woong-dal +https://www.youtube.com/@johnleeschool +https://www.youtube.com/@syukaworld-comics +https://www.youtube.com/@channelA-news +https://www.youtube.com/@user-bh7lr7pe9g +https://www.youtube.com/@singlefire +https://www.youtube.com/@moneyhi +https://www.youtube.com/@top.trader +https://www.youtube.com/@jusikdante + + + +zh-TW +https://www.youtube.com/@kukantieh + +ja +https://www.youtube.com/@DanTakahashi1 +https://www.youtube.com/@tvtokyobiz +https://www.youtube.com/@SHO1112 +https://www.youtube.com/@pivot8935 +https://www.youtube.com/@nikkei +https://www.youtube.com/@toushikomon +https://www.youtube.com/@pivot8935 +https://www.youtube.com/@NewsPicks/featured +https://www.youtube.com/@higedura24 +https://www.youtube.com/@tvtokyobiz +https://www.youtube.com/@omaegaowattendayo +https://www.youtube.com/@info_ask1 +https://www.youtube.com/@takaisanno/videos +https://www.youtube.com/@takaponjp +https://www.youtube.com/@tbsnewsdig +https://www.youtube.com/@rehacq +https://www.youtube.com/@mabuchi-mariko +https://www.youtube.com/@fp_nigu +https://www.youtube.com/@yukkuri-money +https://www.youtube.com/@SHO1112 +https://www.youtube.com/@yohei-chokin +https://www.youtube.com/@user-yu9sj9gq7z/videos +https://www.youtube.com/@tesuta-clipping +https://www.youtube.com/@tradelabo2222 +https://www.youtube.com/@jin115xx +https://www.youtube.com/@higedura24 +https://www.youtube.com/@nobujuku +https://www.youtube.com/@tokyosoken +https://www.youtube.com/@user-hx7bn7hp9v +https://www.youtube.com/@SLokRE +https://www.youtube.com/@rehacq +https://www.youtube.com/@moha-p +https://www.youtube.com/results?search_query=Buffett+Taro%27s +https://www.youtube.com/@Gorikoro diff --git a/init/urlList_ja.txt b/init/urlList_ja.txt new file mode 100644 index 0000000..d8a06a4 --- /dev/null +++ b/init/urlList_ja.txt @@ -0,0 +1,7 @@ +ja +https://www.youtube.com/@ryogakucho 
# python3 sftp.py --local="/mnt/tmp_srt_file" --logDir="./logs"
# python3 sftp.py --local="/mnt/test_file" --logDir="./logs"
if __name__ == "__main__":
    # Read the command-line arguments.
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--local", type=str, default="")
    parser.add_argument('--logDir', type=str, default='')
    args = parser.parse_args()
    Contant.logDir = args.logDir
    initLogger()

    # Read the SFTP settings from the configuration file.
    config = configparser.ConfigParser()
    config.read('sftp_config.ini')
    hostname = config.get('sftp_config', 'hostname')
    port = config.getint('sftp_config', 'port')
    username = config.get('sftp_config', 'username')
    password = config.get('sftp_config', 'password')

    # The password is deliberately not logged.
    Logger.info("host:{},port:{},username:{}".format(
        hostname, port, username))

    ssh_client = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts any unknown host key; consider
    # loading known hosts and rejecting unknown keys instead.
    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    sftp_client = None
    try:
        ssh_client.connect(hostname, port, username, password)
        sftp_client = ssh_client.open_sftp()
        Logger.info("SFTP客户端已经建立:{}".format(sftp_client))

        remote_root = "/Inbound/YouTube Captions"
        local_root = args.local
        Logger.info("remote_root:{},local_root:{}".format(remote_root, local_root))

        for name in os.listdir(local_root):
            localDir = local_root + "/" + name
            if not os.path.isdir(localDir):
                # Only per-channel directories are uploaded.
                continue
            remoteDir = remote_root + "/" + name

            # Enter the remote directory, creating it first when missing.
            try:
                sftp_client.chdir(remoteDir)
            except IOError:
                sftp_client.mkdir(remoteDir)
                sftp_client.chdir(remoteDir)

            for srt in os.listdir(localDir):
                remotePath = remoteDir + "/" + srt
                localPath = localDir + "/" + srt
                # Shorten over-long remote paths before anything else so the
                # stale-file check and the upload use the same target.
                if len(remotePath) > 120:
                    remotePath = remotePath[:-20] + ".srt"

                # Remove a stale remote copy if there is one.
                try:
                    sftp_client.stat(remotePath)
                    sftp_client.remove(remotePath)
                    Logger.info("Remote file '{}' deleted.".format(remotePath))
                except FileNotFoundError:
                    Logger.info("Remote file '{}' not found.".format(remotePath))

                # Upload, then delete the local file.
                try:
                    if os.path.exists(localPath):
                        Logger.info("本地文件 '{}' 存在,开始上传.".format(localPath))
                        sftp_client.put(localPath, remotePath, confirm=False)
                        os.remove(localPath)
                    else:
                        Logger.info("本地文件 '{}' 不存在,无法上传.".format(localPath))
                except Exception as e:
                    Logger.info("上传失败 '{}' 文件名长度{}".format(
                        remotePath, len(remotePath)))
                    Logger.error(e)

            # Start every directory with a fresh SFTP channel (as before).
            sftp_client.close()
            sftp_client = ssh_client.open_sftp()
    finally:
        # Release the connection even when an upload step raised.
        if sftp_client is not None:
            sftp_client.close()
        ssh_client.close()

    # Send the DingTalk notification.
    # SECURITY NOTE(review): the access token is committed in source; set
    # DINGTALK_WEBHOOK in the environment and rotate the embedded token.
    webhook = os.environ.get(
        "DINGTALK_WEBHOOK",
        "https://oapi.dingtalk.com/robot/send?access_token=c8c8d7d42c4eecd449dd303025ef968f647d1d8e8694e3fabc0ab5770d646dcb",
    )
    jsonData = {
        "msgtype": "text",
        "text": {
            "content": "[Youtube]sftp finished"
        }
    }
    try:
        requests.post(webhook, json=jsonData, timeout=10)
        Logger.info("sftp发送钉钉消息成功...")
    except requests.RequestException as e:
        Logger.error("DingTalk notification failed: {}".format(e))
/dev/null +++ b/sftp/sftp_config.ini @@ -0,0 +1,5 @@ +[sftp_config] +hostname = filetransfer.blackrock.com +port = 22 +username = ftp_yunbo +password = s8v{8SJr diff --git a/src/ChannelService.py b/src/ChannelService.py new file mode 100644 index 0000000..3f74d22 --- /dev/null +++ b/src/ChannelService.py @@ -0,0 +1,15 @@ +import json +from Orm import Channel +from playhouse.shortcuts import model_to_dict, dict_to_model + + +class ChannelService: + def getOneByChannelId(channelId): + return Channel.get_or_none(Channel.channelId == channelId) + + def updateTimeByChannelId(channelId, chageTime): + Channel.update(channelReptileTime=chageTime).where( + Channel.channelId == channelId).execute() + + def getChannelList(): + return Channel.select().execute() diff --git a/src/Contant.py b/src/Contant.py new file mode 100644 index 0000000..3609b9d --- /dev/null +++ b/src/Contant.py @@ -0,0 +1,4 @@ +db="" +logDir="" +startTime="" +endTime="" \ No newline at end of file diff --git a/src/DownloadInfoService.py b/src/DownloadInfoService.py new file mode 100644 index 0000000..802e89f --- /dev/null +++ b/src/DownloadInfoService.py @@ -0,0 +1,16 @@ +from Orm import DownloadInfo + + +class DownloadService: + + def createOne(videoId, downloadType, tryTime, isFinished): + DownloadInfo.create( + videoId=videoId, + downloadType=downloadType, + tryTime=tryTime, + isFinished=isFinished + ) + + def updateInfoByVideoId(videoId, tryTime, isFinished): + DownloadInfo.update(tryTime=tryTime, isFinished=isFinished).where( + DownloadInfo.videoId == videoId).execute() diff --git a/src/LoggerUtils.py b/src/LoggerUtils.py new file mode 100644 index 0000000..90f071e --- /dev/null +++ b/src/LoggerUtils.py @@ -0,0 +1,6 @@ +from loguru import logger +import Contant +Logger = logger +def initLogger(): + logger.add(Contant.logDir+"/main_{time}.log", rotation="500MB", encoding="utf-8", + enqueue=True, compression="zip", retention="10 days") diff --git a/src/Orm.py b/src/Orm.py new file mode 100644 index 
0000000..1e20a58 --- /dev/null +++ b/src/Orm.py @@ -0,0 +1,68 @@ +from peewee import * +import Contant +import argparse +from LoggerUtils import Logger + +parser = argparse.ArgumentParser(description='') +parser.add_argument('--db', type=str, default='') +parser.add_argument('--logDir', type=str, default='') +parser.add_argument("--start", type=str, default="") +parser.add_argument("--end", type=str, default="") +parser.add_argument("--channelId", type=str, default="") +args = parser.parse_args() +Contant.db = args.db +db = SqliteDatabase(Contant.db) + + +def ormInit(): + Channel.create_table() + Video.create_table() + DownloadInfo.create_table() + + +class BaseModel(Model): + class Meta: + database = db + +# 频道信息 + + +class Channel(BaseModel): + id = PrimaryKeyField() + channelId = CharField(null=False) + channelTitle = CharField(null=False) + channelLanguage = CharField() + channelReptileTime = CharField(null=True) + + class Meta: + db_table = 'Channel' + +# 视频信息 + + +class Video(BaseModel): + id = PrimaryKeyField() + videoId = CharField(null=False) + channelId = CharField(null=False) + videoTitle = CharField() + videoLen = IntegerField() + videoType = CharField() + videoPublishTime = CharField() + videoLanguage = CharField() + isDownload = IntegerField() + + class Meta: + db_table = 'Vidoes' + +# 下载信息 + + +class DownloadInfo(BaseModel): + id = PrimaryKeyField() + videoId = CharField() + downloadType = IntegerField() + tryTime = IntegerField() + isFinished = IntegerField() + + class Meta: + db_table = 'Download_info' diff --git a/src/SrcTest.py b/src/SrcTest.py new file mode 100644 index 0000000..e69de29 diff --git a/src/VideoService.py b/src/VideoService.py new file mode 100644 index 0000000..3273712 --- /dev/null +++ b/src/VideoService.py @@ -0,0 +1,31 @@ +import json +from Orm import Video +from playhouse.shortcuts import model_to_dict, dict_to_model + + +class VideoService: + def getOneByVideoId(videoId): + return Video.get_or_none(Video.videoId == videoId) + 
class YouTubeUtil:
    """Helpers around the YouTube Data API v3 used to index channel videos.

    All methods are called on the class itself and are therefore static.
    """

    # SECURITY NOTE(review): API keys are committed in source. Rotate them and
    # load them from configuration or the environment instead.
    apiKeys = [
        "AIzaSyARaW3mqO9szQiHgWZR4el0HWvdyheSHBc",
        "AIzaSyChPXesnVx6fweon_BckhR6UiJWvi5Ma4s",
    ]

    # Index of the key handed out by the next getYoutube() call.
    apiIndex = 0

    @staticmethod
    def getYoutube():
        """Build an API client using the next key in round-robin order."""
        http = httplib2.Http(timeout=10, disable_ssl_certificate_validation=False)
        apiKey = YouTubeUtil.apiKeys[YouTubeUtil.apiIndex]
        # Only a short prefix of the key is logged so the secret stays out of
        # the log files.
        Logger.info(
            "当前APIKey:{},当前apiIndex:{},totalIndex:{}".format(
                apiKey[:6] + "***", YouTubeUtil.apiIndex, len(YouTubeUtil.apiKeys) - 1
            )
        )
        # Advance to the next key, wrapping around after the last one.
        YouTubeUtil.apiIndex = (YouTubeUtil.apiIndex + 1) % len(YouTubeUtil.apiKeys)
        return googleapiclient.discovery.build(
            "youtube", "v3", developerKey=apiKey, http=http
        )

    @staticmethod
    def getVidoeLen(videoIds):
        """Return the ``videos.list`` (``contentDetails``) response for ``videoIds``.

        Args:
            videoIds: comma-separated video ids; stray leading or trailing
                commas are removed before the request is sent.
        """
        if isinstance(videoIds, str):
            videoIds = videoIds.strip(",")
        youtube = YouTubeUtil.getYoutube()
        request = youtube.videos().list(part="contentDetails", id=videoIds)
        return request.execute()

    @staticmethod
    def getVideoLenByStr(str):
        """Convert an ISO 8601 duration such as ``PT1H2M3S`` to seconds.

        Besides the ``PT#H#M#S`` forms, week and day components
        (``P1DT2H``, ``P0D``) are accepted.

        Args:
            str: the duration text. The parameter keeps its historical name
                for compatibility although it shadows the builtin.

        Returns:
            The duration in whole seconds.

        Raises:
            ValueError: if the text is not a supported duration.
        """
        import re

        match = re.fullmatch(
            r"P(?:(\d+)W)?(?:(\d+)D)?"
            r"(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?",
            str,
        )
        if match is None:
            raise ValueError("unsupported duration: {!r}".format(str))
        weeks, days, hours, minutes, seconds = (
            int(part) if part else 0 for part in match.groups()
        )
        return (((weeks * 7 + days) * 24 + hours) * 60 + minutes) * 60 + seconds

    @staticmethod
    def _updateLengths(videoIds):
        """Fetch the durations of ``videoIds`` (a list of ids) and store them."""
        lenRes = YouTubeUtil.getVidoeLen(",".join(videoIds))
        for item in lenRes["items"]:
            tmpId = item["id"]
            videoLen = YouTubeUtil.getVideoLenByStr(item["contentDetails"]["duration"])
            VideoService.updateLenByVideoId(tmpId, videoLen)
            Logger.info("更新时长,videoId:{},len:{}".format(tmpId, videoLen))

    @staticmethod
    def getByChannelId(channelId, startTime, endTime):
        """Index every video a channel published between two timestamps.

        For each search result that is not stored yet, a ``Video`` row and a
        type-1 download task are created. Durations are fetched in batches of
        10 ids, and the remainder of each page is flushed as well, so no new
        video is left with length 0. After each page the channel's crawl time
        is set to the publish time of its newest stored video.

        Args:
            channelId: YouTube channel id; unknown channels are ignored.
            startTime: value passed to the API as ``publishedAfter``.
            endTime: value passed to the API as ``publishedBefore``.
        """
        channel = ChannelService.getOneByChannelId(channelId)
        if channel is None:
            return
        videoLanguage = str(channel.channelLanguage)
        youtube = YouTubeUtil.getYoutube()
        searchArgs = dict(
            part="snippet",
            channelId=channelId,
            maxResults=50,
            order="date",
            publishedAfter=startTime,
            publishedBefore=endTime,
            type="video",
        )
        response = youtube.search().list(**searchArgs).execute()
        while True:
            newIds = []
            for item in response["items"]:
                try:
                    videoId = item["id"]["videoId"]
                    publisTime = item["snippet"]["publishedAt"]
                    videoTitle = item["snippet"]["title"]
                    if VideoService.getOneByVideoId(str(videoId)) is None:
                        VideoService.createOne(
                            videoId,
                            channelId,
                            videoTitle,
                            0,
                            "video",
                            publisTime,
                            videoLanguage,
                            0,
                        )
                        DownloadService.createOne(videoId, 1, 0, 0)
                        newIds.append(str(videoId))
                        Logger.info(
                            "存储VideoUrl:https://www.youtube.com/watch?v=" + videoId
                        )
                    else:
                        Logger.info("已存在VideoId:{}".format(videoId))
                    if len(newIds) >= 10:
                        batch, newIds = newIds, []
                        YouTubeUtil._updateLengths(batch)
                except Exception as e:
                    # One bad item must not stop the page, but it must be visible.
                    Logger.error("failed to process search item: {}".format(e))

            # Flush the ids that did not fill a whole batch.
            if newIds:
                try:
                    YouTubeUtil._updateLengths(newIds)
                except Exception as e:
                    Logger.error("failed to update video lengths: {}".format(e))

            # Record the newest stored video; skip channels without any video,
            # for which the lookup would raise.
            if VideoService.checkExist(channelId):
                newest = VideoService.getLastVideoByChannelId(channelId)
                ChannelService.updateTimeByChannelId(channelId, newest.videoPublishTime)
            time.sleep(5)

            pageToken = response.get("nextPageToken")
            if not pageToken:
                print("no nextPageToken")
                break
            try:
                response = youtube.search().list(
                    pageToken=pageToken, **searchArgs
                ).execute()
            except Exception as e:
                Logger.error(e)
                break
a/src/main.py b/src/main.py new file mode 100644 index 0000000..ab7880b --- /dev/null +++ b/src/main.py @@ -0,0 +1,49 @@ +import argparse +import Contant +import LoggerUtils +import Orm +from VideoService import VideoService +from YouTubeUtils import YouTubeUtil +from ChannelService import ChannelService +import requests + +# py .\main.py --db=../db/youtube_prod.db --logDir=./logs --start="2023-09-10T00:00:01Z" --end="2023-09-11T00:00:01Z" +# python3 ./main.py --db="../db/youtube_prod.db" --logDir="./logs" --start="2024-03-25T00:10:01Z" --end="2024-03-26T00:10:01Z" +# python3 ./main.py --db="../db/youtube_prod.db" --logDir="./logs" --start="111" --end="222" +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("--db", type=str, default="") + parser.add_argument("--logDir", type=str, default="") + parser.add_argument("--start", type=str, default="") + parser.add_argument("--end", type=str, default="") + args = parser.parse_args() + Contant.db = args.db + Contant.logDir = args.logDir + Contant.startTime = args.start + Contant.endTime = args.end + LoggerUtils.initLogger() + Orm.ormInit() + LoggerUtils.Logger.info("db:{},logDir:{}".format(Contant.db, Contant.logDir)) + LoggerUtils.Logger.info("starTime:{},endTime:{}".format(Contant.startTime, Contant.endTime)) + + # 执行查询 + channelList = ChannelService.getChannelList() + LoggerUtils.Logger.info("list size:{}".format(len(channelList))) + for channel in channelList: + channelId = channel.channelId + LoggerUtils.Logger.info( + "channelId:{},startTime:{},endTime:{}".format( + channelId, Contant.startTime, Contant.endTime + ) + ) + YouTubeUtil.getByChannelId(channelId, Contant.startTime, Contant.endTime) + # 发送钉钉消息 + webhook = "https://oapi.dingtalk.com/robot/send?access_token=c8c8d7d42c4eecd449dd303025ef968f647d1d8e8694e3fabc0ab5770d646dcb" + jsonData = { + "msgtype": "text", + "text": { + "content": "[Youtube]src finished" + } + } + requests.post(webhook, json=jsonData) + 
LoggerUtils.Logger.info("src发送钉钉消息成功...") diff --git a/src/one_channel.py b/src/one_channel.py new file mode 100644 index 0000000..251e4ff --- /dev/null +++ b/src/one_channel.py @@ -0,0 +1,32 @@ +import argparse +import Contant +import LoggerUtils +import Orm +from VideoService import VideoService +from YouTubeUtils import YouTubeUtil +from ChannelService import ChannelService +import requests + +# py .\main.py --db=../db/youtube_prod.db --logDir=./logs --start="2023-09-10T00:00:01Z" --end="2023-09-11T00:00:01Z" +# python3 ./main.py --db="../db/youtube_prod.db" --logDir="./logs" --start="2023-08-10T00:00:01Z" --end="2023-09-12T00:00:01Z" +# python3 ./main.py --db="../db/youtube_prod.db" --logDir="./logs" --start="111" --end="222" +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCzoF2M_RG3Qz10hP16vQOng" +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("--db", type=str, default="") + parser.add_argument("--logDir", type=str, default="") + parser.add_argument("--start", type=str, default="") + parser.add_argument("--end", type=str, default="") + parser.add_argument("--channelId", type=str, default="") + args = parser.parse_args() + Contant.db = args.db + Contant.logDir = args.logDir + Contant.startTime = args.start + Contant.endTime = args.end + channelId = args.channelId + LoggerUtils.initLogger() + Orm.ormInit() + LoggerUtils.Logger.info("db:{},logDir:{}".format(Contant.db, Contant.logDir)) + LoggerUtils.Logger.info("channleId:{},starTime:{},endTime:{}".format(channelId, Contant.startTime, Contant.endTime)) + + YouTubeUtil.getByChannelId(channelId, Contant.startTime, Contant.endTime) diff --git a/src_tmp.sh b/src_tmp.sh new file mode 100644 index 0000000..cb9eb25 --- /dev/null +++ b/src_tmp.sh @@ -0,0 +1,87 @@ +#!/bin/bash +cd /mnt/youtube_prod/src + + + +# python3 ./one_channel.py 
--db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCpsfkRRT7L2nBnizBn_u9YA" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCRbT3P-2tmr-9l8D7jNoZMQ" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCPTy0BNqiv-0SdAvFgrXvXg" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCMlDu8Vuowmqz03kByFcUhw" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UC5mn3VEg_9GY52G6eumKJRg" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UClhhyZ0xyeOAEVdcr0N9KDA" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCBM86JVoHLqg9irpR2XKvGw" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCzp9CmDIFVNtzhyOjptIi4g" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCv-spDeZBGYVUI9eGXGaLSg" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCF08I8KEKTsBo22RIXFwTAA" +# sleep 600 +# python3 ./one_channel.py 
--db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UC5Mjj4LKlMtP_PXlIVYGxIQ" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCvil4OAt-zShzkKHsg9EQAw" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCI6C5V4J8FWRcLcOdh1yElw" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCOio3vyYLWiKlHSYRKW-9UA" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCaWi2foADm_lKAKnmeQwLSA" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCUFUOdQwKTWda7kKqxQwMxw" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCoZdXdFowKP0heWRkQ9RABQ" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCnfwIKyFYRuqZzzKBDt6JOA" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCnZJqzwt6LuRymM0jbqiD9A" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCHpGooMnVgnILywqrpqvZcQ" +# sleep 600 +# python3 ./one_channel.py 
--db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCQIyAcoLsO3L0RMFQk7YMYA" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCYdHxiRAUUJhuE1DZsnWqXg" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCbOIEn95Rvnk97KRtSFqvbQ" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCXWOlSe2GHTev8QZhY_gMPg" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCJo6G1u0e_-wS-JQn3T-zEw" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCfq4V1DAuaojnr2ryvWNysw" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCFznPlqnBtRKQhtkm6GGoRQ" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UC5CyCSvCdoEP-VgQmFq3iww" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UC6mp159KMtzjhP65DmldR0A" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UC7YLvjJf3lDJUQ-TsbWyBjg" +# sleep 600 +# python3 ./one_channel.py 
--db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UC6ij59Gy_HnqO4pFu9A_zgQ" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCpyjRAERLqcD_wI3qQnIY3A" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCSU_iBWoCnXe1VnAbQhO3Ug" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UC6ZkHcW5QQubZ-Q6XYINE3Q" +# sleep 600 +# python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCDpRrAXMYlxFz3a5-z8pE7w" +# sleep 600 +python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCMec1m9iUC3agiEK-nsndSg" +sleep 600 +python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCOmXyHRWpDFPYgs2VpoQEIw" +sleep 600 +python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCPgT-N-DQ0K0H88skjaDgkA" +sleep 600 +python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UC40nk9kM2Ue8XQ9LsHQlKPA" +sleep 600 +python3 ./one_channel.py --db="../db/youtube_prod.db" --logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCaiV1-PUXDu2Nmx8iOZkofQ" +sleep 600 +python3 ./one_channel.py --db="../db/youtube_prod.db" 
--logDir="./logs/one_channel" --start="2021-03-06T00:00:01Z" --end="2024-03-06T00:00:01Z" --channelId="UCDDneQi63kJAdr3i5VCPzHg" +sleep 600 \ No newline at end of file diff --git a/start_download.sh b/start_download.sh new file mode 100644 index 0000000..c5150f5 --- /dev/null +++ b/start_download.sh @@ -0,0 +1,10 @@ +#!/bin/bash +function log() { + local time_now=`date '+%Y-%m-%d %H:%M:%S'` + echo "$time_now [download] [info] $1" >> /mnt/youtube_prod/running.log +} + +cd /mnt/youtube_prod/download +# /mnt/youtube_prod/start_download.sh +log "开始执行download..." +nohup python3 ./main_download.py --db="../db/youtube_prod.db" --logDir="./logs" >/dev/null 2>/mnt/youtube_prod/err.log & \ No newline at end of file diff --git a/start_sftp.sh b/start_sftp.sh new file mode 100644 index 0000000..8b93512 --- /dev/null +++ b/start_sftp.sh @@ -0,0 +1,11 @@ +#!/bin/bash +function log() { + local time_now=`date '+%Y-%m-%d %H:%M:%S'` + echo "$time_now [download] [info] $1" >> /mnt/youtube_prod/running.log +} + +cd /mnt/youtube_prod/sftp +# /mnt/youtube_prod/start_download.sh +log "开始执行sftp..." 
+python3 ./sftp.py --local="/mnt/tmp_srt_file" --logDir="./logs" +rm -rf /mnt/tmp_srt_file \ No newline at end of file diff --git a/start_src.sh b/start_src.sh new file mode 100644 index 0000000..94f7f5b --- /dev/null +++ b/start_src.sh @@ -0,0 +1,12 @@ +#!/bin/bash +function log() { + local time_now=`date '+%Y-%m-%d %H:%M:%S'` + echo "$time_now [src] [info] $1" >> /mnt/youtube_prod/running.log +} + +cd /mnt/youtube_prod/src +start=`date '+%Y-%m-%dT%H:%M:%SZ' -d'-1 day'` +end=`date '+%Y-%m-%dT%H:%M:%SZ'` +log "开始执行src...startTime:"$start",endTime:"$end +# /mnt/youtube_prod/start_src.sh +nohup python3 ./main.py --db="../db/youtube_prod.db" --logDir="./logs" --start=$start --end=$end >/dev/null 2>/mnt/youtube_prod/err.log & \ No newline at end of file diff --git a/stop_download.sh b/stop_download.sh new file mode 100644 index 0000000..2b0d345 --- /dev/null +++ b/stop_download.sh @@ -0,0 +1,4 @@ +#!/bin/bash +pid=`ps -ef | grep main_download | awk NR==1'{print $2}'` +echo $pid +kill -9 $pid \ No newline at end of file diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..4e66871 --- /dev/null +++ b/test.sh @@ -0,0 +1,2 @@ +#!/bin/bash +echo "test" \ No newline at end of file diff --git a/test/test.py b/test/test.py new file mode 100644 index 0000000..2a42732 --- /dev/null +++ b/test/test.py @@ -0,0 +1,9 @@ +import requests +webhook = "https://oapi.dingtalk.com/robot/send?access_token=c8c8d7d42c4eecd449dd303025ef968f647d1d8e8694e3fabc0ab5770d646dcb" +jsonData = { + "msgtype": "text", + "text": { + "content": "[Youtube]aaaa" + } +} +requests.post(webhook, json=jsonData) diff --git a/test/test2.py b/test/test2.py new file mode 100644 index 0000000..dd8b022 --- /dev/null +++ b/test/test2.py @@ -0,0 +1,8 @@ +from youtube_transcript_api import YouTubeTranscriptApi +#zh-Hant +url = "https://www.youtube.com/watch?v=YbVger_nh-s" +list = YouTubeTranscriptApi.list_transcripts("_i5CoY_LMYs") +# videoSrt = YouTubeTranscriptApi.get_transcript( +# "gXeNXJrD-gw", 
languages=['zh-TW']) +print(list) +# print(videoSrt) \ No newline at end of file diff --git a/view_count/ChannelService.py b/view_count/ChannelService.py new file mode 100644 index 0000000..0a57c70 --- /dev/null +++ b/view_count/ChannelService.py @@ -0,0 +1,15 @@ +import json +from Orm import Channel +from playhouse.shortcuts import model_to_dict, dict_to_model + + +class ChannelService: + def getOneByChannelId(channelId): + return Channel.get_or_none(Channel.channelId == channelId) + + def updateTimeByChannelId(channelId, chageTime): + Channel.update(channelReptileTime=chageTime).where( + Channel.channelId == channelId).execute() + + def getChannelList(): + return Channel.select().execute() \ No newline at end of file diff --git a/view_count/Contant.py b/view_count/Contant.py new file mode 100644 index 0000000..63850f7 --- /dev/null +++ b/view_count/Contant.py @@ -0,0 +1,10 @@ +db="" +logDir="" +startTime="" +endTime="" +apiIndex = 0 +apiKeys = [ + "AIzaSyDjPkCgDQ9Tv_xcChjY2E6GpJ6IzngnD5I", + "AIzaSyAxIycOdQYGB5kWhwe3B-kJAYRo7wOnp8o", + "AIzaSyCsYUC5vN0pB6y9xsCj0B1ehAoqOJ3WMf0" +] \ No newline at end of file diff --git a/view_count/LoggerUtils.py b/view_count/LoggerUtils.py new file mode 100644 index 0000000..90f071e --- /dev/null +++ b/view_count/LoggerUtils.py @@ -0,0 +1,6 @@ +from loguru import logger +import Contant +Logger = logger +def initLogger(): + logger.add(Contant.logDir+"/main_{time}.log", rotation="500MB", encoding="utf-8", + enqueue=True, compression="zip", retention="10 days") diff --git a/view_count/Orm.py b/view_count/Orm.py new file mode 100644 index 0000000..2286c3c --- /dev/null +++ b/view_count/Orm.py @@ -0,0 +1,75 @@ +from peewee import * +import Contant +import argparse +from LoggerUtils import Logger + +parser = argparse.ArgumentParser(description='') +parser.add_argument('--db', type=str, default='') +parser.add_argument('--logDir', type=str, default='') +args = parser.parse_args() +Contant.db = args.db +db = SqliteDatabase(Contant.db) + 
+ +def ormInit(): + Channel.create_table() + Video.create_table() + DownloadInfo.create_table() + ViewCountInfo.create_table() + + +class BaseModel(Model): + class Meta: + database = db + +# 频道信息 + + +class Channel(BaseModel): + id = PrimaryKeyField() + channelId = CharField(null=False) + channelTitle = CharField(null=False) + channelLanguage = CharField() + channelReptileTime = CharField(null=True) + + class Meta: + db_table = 'Channel' + +# 视频信息 + + +class Video(BaseModel): + id = PrimaryKeyField() + videoId = CharField(null=False) + channelId = CharField(null=False) + videoTitle = CharField() + videoLen = IntegerField() + videoType = CharField() + videoPublishTime = CharField() + videoLanguage = CharField() + isDownload = IntegerField() + + class Meta: + db_table = 'Vidoes' + +# 下载信息 + + +class DownloadInfo(BaseModel): + id = PrimaryKeyField() + videoId = CharField() + downloadType = IntegerField() + tryTime = IntegerField() + isFinished = IntegerField() + + class Meta: + db_table = 'Download_info' + +# 播放量信息 +class ViewCountInfo(BaseModel): + id = PrimaryKeyField() + videoId = CharField() + viewCount = CharField() + + class Meta: + db_table = 'ViewCount_info' \ No newline at end of file diff --git a/view_count/VideoCountService.py b/view_count/VideoCountService.py new file mode 100644 index 0000000..4349a8f --- /dev/null +++ b/view_count/VideoCountService.py @@ -0,0 +1,33 @@ +import json +from Orm import ViewCountInfo +from playhouse.shortcuts import model_to_dict, dict_to_model + + +class ViewCountService: + def createOrUpdateOne(videoId, day,count): + query = ViewCountInfo.select().where(ViewCountInfo.videoId == videoId) + if not query: + countStr = "0" + for i in range(0,30): + if i != 29: + countStr = countStr + "," + "0" + list = countStr.split(",") + list[day-1] = count + countStr = "" + for i in range(0,30): + if i != 29: + countStr = countStr + str(list[i]) + "," + else: + countStr = countStr + str(list[i]) + ViewCountInfo.create(videoId=videoId, 
viewCount=countStr) + else: + viewCountInfo = ViewCountInfo.select().where(ViewCountInfo.videoId == videoId).get() + list = viewCountInfo.viewCount.split(",") + list[day-1] = count + countStr = "" + for i in range(0,30): + if i != 29: + countStr = countStr + str(list[i]) + "," + else: + countStr = countStr + str(list[i]) + ViewCountInfo.update(viewCount=countStr).where(ViewCountInfo.videoId == videoId).execute() \ No newline at end of file diff --git a/view_count/VideoService.py b/view_count/VideoService.py new file mode 100644 index 0000000..0b613a9 --- /dev/null +++ b/view_count/VideoService.py @@ -0,0 +1,34 @@ +import json +from Orm import Video +from playhouse.shortcuts import model_to_dict, dict_to_model + + +class VideoService: + def getOneByVideoId(videoId): + return Video.get_or_none(Video.videoId == videoId) + + def createOne(videoId, channelId, videoTitle, videoLen, videoType, videoPublishTime, videoLanguage, isDownload): + Video.create(videoId=videoId, + channelId=channelId, + videoTitle=videoTitle, + videoLen=videoLen, + videoType=videoType, + videoPublishTime=videoPublishTime, + videoLanguage=videoLanguage, + isDownload=isDownload) + + def updateLenByVideoId(videoId, len): + Video.update(videoLen=len).where(Video.videoId == videoId).execute() + + def getLastVideoByChannelId(channelId): + return Video.select().where(Video.channelId == channelId).order_by(Video.videoPublishTime.desc()).get() + + def getFirstVideoByChannelId(channelId): + return Video.select().where(Video.channelId == channelId).order_by(Video.videoPublishTime).get() + + def checkExist(channelId): + query = Video.select().where(Video.channelId == channelId) + return query.exists() + + def getVideosByTime(startTime,endTime): + return Video.select().where(Video.videoPublishTime >= startTime,Video.videoPublishTime <= endTime).execute() diff --git a/view_count/view_count_main.py b/view_count/view_count_main.py new file mode 100644 index 0000000..98fc806 --- /dev/null +++ 
b/view_count/view_count_main.py @@ -0,0 +1,99 @@ +import argparse +import random +import time +import Contant +from LoggerUtils import Logger, initLogger +import Orm +from VideoService import VideoService +from ChannelService import ChannelService +from VideoCountService import ViewCountService +from func_timeout import func_set_timeout +import func_timeout +import requests +import httplib2 +import googleapiclient.discovery +import googleapiclient.errors +import datetime + +apiIndex = 0 +apiKeys = [ + "AIzaSyDjPkCgDQ9Tv_xcChjY2E6GpJ6IzngnD5I", + "AIzaSyAxIycOdQYGB5kWhwe3B-kJAYRo7wOnp8o", + "AIzaSyCsYUC5vN0pB6y9xsCj0B1ehAoqOJ3WMf0" +] + + +def getYoutube(): + proxy_info = httplib2.ProxyInfo( + proxy_type=httplib2.socks.PROXY_TYPE_HTTP, proxy_host="127.0.0.1", proxy_port=7890) + # http = httplib2.Http(timeout=10, proxy_info=proxy_info, + # disable_ssl_certificate_validation=False) + http = httplib2.Http(timeout=10, disable_ssl_certificate_validation=False) + # http = httplib2.Http(timeout=10, disable_ssl_certificate_validation=False) + api_service_name = "youtube" + api_version = "v3" + # 获取apiKey + apiKey = "AIzaSyARaW3mqO9szQiHgWZR4el0HWvdyheSHBc" + + # 获取对象 + youtube = googleapiclient.discovery.build( + api_service_name, api_version, developerKey=Contant.apiKeys[Contant.apiIndex], http=http + ) + return youtube + + +def updateVideoViewCount(day, startTime, endTime): + list = VideoService.getVideosByTime(startTime, endTime) + Logger.info(len(list)) + videoCount = 0 + totalCount = 0 + videosRequest = "" + youtube = getYoutube() + for video in list: + videoCount = videoCount + 1 + totalCount = totalCount + 1 + Logger.info(video.videoId) + videosRequest = videosRequest + "," + video.videoId + if videoCount == 50 or videoCount == len(list) or totalCount == len(list): + request = youtube.videos().list(part="statistics", id=videosRequest) + if Contant.apiIndex < (len(Contant.apiKeys) - 1): + Contant.apiIndex = Contant.apiIndex + 1 + else: + Contant.apiIndex = 0 + 
response = request.execute() + for item in response['items']: + try: + Logger.info(item) + ViewCountService.createOrUpdateOne( + item['id'], day, item['statistics']['viewCount']) + except Exception as e: + Logger.error("存储失败{}".format(item)) + videosRequest = "" + videoCount = 0 + +# python ./view_count_main.py --db="../db/youtube_prod.db" --logDir="./logs" --start="2024-01-03T00:00:00Z" --end="2024-01-04T00:00:00Z" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='') + parser.add_argument('--db', type=str, default='') + parser.add_argument('--logDir', type=str, default='') + args = parser.parse_args() + Contant.db = args.db + Contant.logDir = args.logDir + initLogger() + Orm.ormInit() + # 查询30天内的所有视屏 + now = datetime.datetime.now() + zero_today = now.replace(hour=0, minute=0, second=0, microsecond=0) + end_today = now.replace(hour=23, minute=59, second=59, microsecond=0) + for i in range(1, 31): + startTime = zero_today+datetime.timedelta(days=-i) + endTime = end_today+datetime.timedelta(days=-i) + startTime = startTime.strftime("%Y-%m-%dT%H:%S:%MZ") + endTime = endTime.strftime("%Y-%m-%dT%H:%S:%MZ") + Logger.info("day:%d, startTime:%s, endTime:%s" % + (i, startTime, endTime)) + updateVideoViewCount(i, startTime, endTime) + # zero_today = zero_today.strftime("%y-%m-%dT%H:%S:%MZ") + # print(zero_today)