class ChannelService:
    """Read-only lookups against the ``Channel`` table."""

    @staticmethod
    def getOneByChannelId(channelId):
        """Return the channel row whose ``channelId`` equals *channelId*.

        Declared as a static method so the call works both on the class
        (``ChannelService.getOneByChannelId(x)``, the existing usage) and on
        an instance.

        :param channelId: YouTube channel id to look up.
        :return: the matching ``Channel`` model instance, or ``None`` when no
            row matches (``get_or_none`` semantics).
        """
        return Channel.get_or_none(Channel.channelId == channelId)
class DownLoadUtil:
    """Download YouTube transcripts as ``.srt`` files and record the outcome.

    ``downloadOne`` handles stage-1 rows (``downloadType == 1``) and writes to
    the store/copy directories; ``downloadTwo`` handles stage-2 rows
    (``downloadType == 2``) and writes below ``./download/``.
    """

    formatter = SRTFormatter()
    # Local proxy settings; kept for callers, not referenced by this class.
    proxies = {"http": "http://127.0.0.1:7890",
               "https": "https://127.0.0.1:7890"}

    _STORE_ROOT = "E:/code/python/srt_file/"
    _COPY_ROOT = "E:/code/python/tmp_srt_file/"
    _STAGE_TWO_ROOT = "./download/"
    # Paths longer than this lose their last 20 characters (see _pairedPaths).
    _MAX_PATH_LEN = 120
    # Characters that are illegal in Windows file names: "/" and "?" are
    # swapped for look-alike code points, the rest are dropped.
    _TITLE_TABLE = str.maketrans({
        "/": "\u2215",
        "?": "\uff1f",
        "\\": None,
        "|": None,
        "<": None,
        ">": None,
        ":": None,
        "*": None,
        '"': None,
    })

    @staticmethod
    def _cleanTitle(rawTitle, extraTokens=()):
        """Return *rawTitle* made safe for use as a file name."""
        title = str(rawTitle).translate(DownLoadUtil._TITLE_TABLE)
        for token in extraTokens:
            title = title.replace(token, "")
        return title

    @staticmethod
    def _ensureDir(path):
        """Create directory *path* (and parents) when it does not exist."""
        if not os.path.exists(path):
            Logger.info("开始创建文件夹:" + path)
            os.makedirs(path)

    @staticmethod
    def _pairedPaths(storeDir, copyDir, fileName):
        """Build the store/copy file paths, shortening both when too long."""
        storePath = storeDir + "/" + fileName
        cpPath = copyDir + "/" + fileName
        if len(cpPath) > DownLoadUtil._MAX_PATH_LEN:
            storePath = storePath[:-20] + ".srt"
            cpPath = cpPath[:-20] + ".srt"
        return storePath, cpPath

    @staticmethod
    def _fetchSrt(videoId, language):
        """Fetch the transcript of *videoId* in *language* as SRT text."""
        transcript = YouTubeTranscriptApi.get_transcript(
            videoId, languages=[language])
        return DownLoadUtil.formatter.format_transcript(transcript)

    @staticmethod
    def _writeSrt(path, text):
        """Write *text* to *path* as UTF-8."""
        with open(path, 'w', encoding='utf-8') as srtFile:
            srtFile.write(text)

    @staticmethod
    def _markFinished(videoId, downloadType):
        """Flag the video as downloaded and close its download-info row."""
        VideoService.updateIsDownloadByVideoId(videoId, 1)
        info = DownloadService.getOneByVideoId(videoId, downloadType)
        if info is not None:
            DownloadService.updateInfoByVideoId(
                videoId, info.tryTime + 1, 1, downloadType)

    @staticmethod
    def _recordRetry(videoId, info, downloadType):
        """Increment the try counter of *info* and leave the row unfinished."""
        if info is not None:
            Logger.info("VideoId:{}开始重试第{}次".format(
                videoId, info.tryTime + 1))
            DownloadService.updateInfoByVideoId(
                videoId, info.tryTime + 1, 0, downloadType)

    @staticmethod
    def _loadVideoAndChannel(videoId, downloadType):
        """Return ``(video, channel)`` or ``(None, None)`` when a row is missing.

        A missing row is counted as a failed attempt so the caller's polling
        loop eventually gives up on the record instead of crashing on it.
        """
        video = VideoService.getOneByVideoId(videoId)
        channel = None
        if video is not None:
            channel = ChannelService.getOneByChannelId(str(video.channelId))
        if video is None or channel is None:
            Logger.error(
                "VideoId:{} has no matching video/channel row".format(videoId))
            DownLoadUtil._recordRetry(
                videoId,
                DownloadService.getOneByVideoId(videoId, downloadType),
                downloadType)
            return None, None
        return video, channel

    @staticmethod
    @func_set_timeout(60)
    def downloadOne(videoId):
        """Download the stage-1 transcript of *videoId*.

        On success the ``.srt`` file is written to the store directory,
        copied to the temporary upload directory, and the video and
        download-info rows are marked finished. On failure:

        * "No transcripts": the row is moved to stage 2 with a reset counter;
        * "File name too long": one retry using the video id as file name;
        * anything else: the try counter is incremented.

        :param videoId: YouTube video id.
        :raises FunctionTimedOut: when the call exceeds 60 seconds.
        """
        video, channel = DownLoadUtil._loadVideoAndChannel(videoId, 1)
        if video is None:
            return
        videoTitle = DownLoadUtil._cleanTitle(video.videoTitle, ("में",))
        # Keep only the date part of the ISO timestamp.
        publishDate = str(video.videoPublishTime).split("T")[0]
        language = str(video.videoLanguage)
        channelDir = str(channel.channelTitle).rstrip()
        storeDir = DownLoadUtil._STORE_ROOT + channelDir
        copyDir = DownLoadUtil._COPY_ROOT + channelDir
        prefix = publishDate + "-" + language + "-"

        Logger.info("开始下载...{}".format(videoId))
        try:
            DownLoadUtil._ensureDir(storeDir)
            DownLoadUtil._ensureDir(copyDir)
            storePath, cpPath = DownLoadUtil._pairedPaths(
                storeDir, copyDir, prefix + videoTitle + ".srt")
            srtText = DownLoadUtil._fetchSrt(videoId, language)
            Logger.info("文件地址...{}".format(storePath))
            DownLoadUtil._writeSrt(storePath, srtText)
            Logger.info("下载完成...{}".format(videoId))
            copyfile(storePath, cpPath)
            DownLoadUtil._markFinished(videoId, 1)
        except Exception as e:
            Logger.error("下载失败...{}".format(videoId))
            logStr = "Exception...{}".format(e)
            Logger.error(logStr)
            downloadInfo = DownloadService.getOneByVideoId(videoId, 1)
            if "No transcripts" in logStr:
                Logger.error("VideoId:{},不存在字幕文件".format(videoId))
                if downloadInfo is not None:
                    DownloadService.changeDownloadType(videoId, 0, 0, 1, 2)
            elif "File name too long" in logStr:
                # Retry in the same (already created) directories, naming the
                # file after the video id instead of the title.
                try:
                    storePath, cpPath = DownLoadUtil._pairedPaths(
                        storeDir, copyDir, prefix + videoId + ".srt")
                    srtText = DownLoadUtil._fetchSrt(videoId, language)
                    Logger.info("文件名过长,文件地址...{}".format(storePath))
                    DownLoadUtil._writeSrt(storePath, srtText)
                    Logger.info("下载完成...{}".format(videoId))
                    copyfile(storePath, cpPath)
                    DownLoadUtil._markFinished(videoId, 1)
                except Exception as fallbackError:
                    Logger.error("Exception...{}".format(fallbackError))
                    DownLoadUtil._recordRetry(videoId, downloadInfo, 1)
            else:
                DownLoadUtil._recordRetry(videoId, downloadInfo, 1)

    @staticmethod
    @func_set_timeout(60)
    def downloadTwo(videoId):
        """Download the stage-2 transcript of *videoId* below ``./download/``.

        On "No transcripts" the row is moved to stage 3 with ``tryTime`` 6;
        any other failure increments the try counter.

        :param videoId: YouTube video id.
        :raises FunctionTimedOut: when the call exceeds 60 seconds.
        """
        video, channel = DownLoadUtil._loadVideoAndChannel(videoId, 2)
        if video is None:
            return
        videoTitle = DownLoadUtil._cleanTitle(video.videoTitle)
        publishDate = str(video.videoPublishTime).split("T")[0]
        language = str(video.videoLanguage)

        Logger.info("开始下载...{}".format(videoId))
        try:
            storeDir = DownLoadUtil._STAGE_TWO_ROOT + str(channel.channelTitle)
            DownLoadUtil._ensureDir(storeDir)
            # "/" works on both Windows and POSIX; a literal backslash would
            # become part of the file name on POSIX.
            storePath = storeDir + "/" + publishDate + \
                "-" + language + "-" + videoTitle + ".srt"
            srtText = DownLoadUtil._fetchSrt(videoId, language)
            Logger.info("文件地址...{}".format(storePath))
            DownLoadUtil._writeSrt(storePath, srtText)
            Logger.info("下载完成...{}".format(videoId))
            DownLoadUtil._markFinished(videoId, 2)
        except Exception as e:
            Logger.error("下载失败...{}".format(videoId))
            logStr = "Exception...{}".format(e)
            Logger.error(logStr)
            downloadInfo = DownloadService.getOneByVideoId(videoId, 2)
            if "No transcripts" in logStr:
                Logger.error("VideoId:{},不存在字幕文件".format(videoId))
                if downloadInfo is not None:
                    DownloadService.changeDownloadType(videoId, 6, 0, 2, 3)
            else:
                DownLoadUtil._recordRetry(videoId, downloadInfo, 2)
parser.parse_args() +Contant.db = args.db +db = SqliteDatabase(Contant.db) + + +def ormInit(): + Channel.create_table() + Video.create_table() + DownloadInfo.create_table() + + +class BaseModel(Model): + class Meta: + database = db + +# 频道信息 + + +class Channel(BaseModel): + id = PrimaryKeyField() + channelId = CharField(null=False) + channelTitle = CharField(null=False) + channelLanguage = CharField() + channelReptileTime = CharField(null=True) + + class Meta: + db_table = 'Channel' + +# 视频信息 + + +class Video(BaseModel): + id = PrimaryKeyField() + videoId = CharField(null=False) + channelId = CharField(null=False) + videoTitle = CharField() + videoLen = IntegerField() + videoType = CharField() + videoPublishTime = CharField() + videoLanguage = CharField() + isDownload = IntegerField() + + class Meta: + db_table = 'Vidoes' + +# 下载信息 + + +class DownloadInfo(BaseModel): + id = PrimaryKeyField() + videoId = CharField() + downloadType = IntegerField() + tryTime = IntegerField() + isFinished = IntegerField() + + class Meta: + db_table = 'Download_info' diff --git a/download/VideoService.py b/download/VideoService.py new file mode 100644 index 0000000..e3fdaef --- /dev/null +++ b/download/VideoService.py @@ -0,0 +1,26 @@ +import json +from Orm import Video +from playhouse.shortcuts import model_to_dict, dict_to_model + + +class VideoService: + def getOneByVideoId(videoId): + return Video.get_or_none(Video.videoId == videoId) + + def createOne(videoId, channelId, videoTitle, videoLen, videoType, videoPublishTime, videoLanguage, isDownload): + Video.create(videoId=videoId, + channelId=channelId, + videoTitle=videoTitle, + videoLen=videoLen, + videoType=videoType, + videoPublishTime=videoPublishTime, + videoLanguage=videoLanguage, + isDownload=isDownload) + + def updateLenByVideoId(videoId, len): + Video.update(videoLen=len).where(Video.videoId == videoId).execute() + + def updateIsDownloadByVideoId(videoId, isDownload): + Video.update(isDownload=isDownload).where( + 
def main():
    """Poll unfinished stage-1 download rows and download each transcript.

    Usage: ``py ./main_download.py --db="../db/youtube_prod.db" --logDir="./logs"``

    The loop ends when ``DownloadService.findNotFinishList()`` returns no
    rows. Every failed attempt, including a timeout, increments the row's
    try counter so a permanently failing video eventually drops out of the
    query instead of being retried forever.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--db', type=str, default='')
    parser.add_argument('--logDir', type=str, default='')
    args = parser.parse_args()
    Contant.db = args.db
    Contant.logDir = args.logDir
    initLogger()
    Orm.ormInit()

    pending = DownloadService.findNotFinishList()
    Logger.info("list size:{}".format(len(pending)))
    while len(pending) > 0:
        for info in pending:
            try:
                DownLoadUtil.downloadOne(info.videoId)
                restTime = random.randint(1, 3)
                Logger.info("间隔{}秒后继续...".format(restTime))
                time.sleep(restTime)
            except func_timeout.exceptions.FunctionTimedOut as e:
                Logger.error("执行下载方法超时错误:{}".format(e))
                # downloadOne was interrupted before it could count the
                # attempt; count it here so the row cannot loop forever.
                DownloadService.updateInfoByVideoId(
                    info.videoId, info.tryTime + 1, 0, 1)
            except Exception:
                # Top-level boundary: log, count the attempt, keep going.
                Logger.exception("下载失败...{}".format(info.videoId))
                DownloadService.updateInfoByVideoId(
                    info.videoId, info.tryTime + 1, 0, 1)
        loopRestTime = random.randint(1, 3)
        Logger.info("循环间隔{}秒后继续...".format(loopRestTime))
        time.sleep(loopRestTime)
        pending = DownloadService.findNotFinishList()


if __name__ == "__main__":
    main()
Logger.info("download发送钉钉消息成功...") \ No newline at end of file diff --git a/init/Contant.py b/init/Contant.py new file mode 100644 index 0000000..b4b3bfa --- /dev/null +++ b/init/Contant.py @@ -0,0 +1,2 @@ +db="" +logDir="" \ No newline at end of file diff --git a/init/LoggerUtils.py b/init/LoggerUtils.py new file mode 100644 index 0000000..4913e92 --- /dev/null +++ b/init/LoggerUtils.py @@ -0,0 +1,6 @@ +from loguru import logger +import Contant +Logger = logger +def initLogger(): + logger.add(Contant.logDir+"/init_{time}.log", rotation="500MB", encoding="utf-8", + enqueue=True, compression="zip", retention="10 days") diff --git a/init/Orm.py b/init/Orm.py new file mode 100644 index 0000000..e672dcb --- /dev/null +++ b/init/Orm.py @@ -0,0 +1,65 @@ +from peewee import * +import Contant +import argparse +from LoggerUtils import Logger + +parser = argparse.ArgumentParser(description='') +parser.add_argument('--db', type=str, default='') +parser.add_argument('--logDir', type=str, default='') +args = parser.parse_args() +Contant.db = args.db +db = SqliteDatabase(Contant.db) + + +def ormInit(): + Channel.create_table() + Vidoe.create_table() + DownloadInfo.create_table() + + +class BaseModel(Model): + class Meta: + database = db + +# 频道信息 + + +class Channel(BaseModel): + id = PrimaryKeyField() + channelId = CharField(null=False) + channelTitle = CharField(null=False) + channelLanguage = CharField() + channelReptileTime = CharField(null=True) + + class Meta: + db_table = 'Channel' + +# 视频信息 + + +class Vidoe(BaseModel): + id = PrimaryKeyField() + videoId = CharField(null=False) + channelId = CharField(null=False) + videoTitle = CharField() + videoLen = IntegerField() + videoType = CharField() + videoPublishTime = CharField() + videoLanguage = CharField() + isDownload = IntegerField() + + class Meta: + db_table = 'Vidoes' + +# 下载信息 + + +class DownloadInfo(BaseModel): + id = PrimaryKeyField() + videoId = CharField() + downloadType = IntegerField() + tryTime = IntegerField() 
def saveChannel(channelUrl, language):
    """Scrape a channel page and insert a ``Channel`` row if it is new.

    The channel id is taken from the ``<link>`` tag whose markup contains
    "canonical" (the part of its ``href`` after ``/channel/``); the channel
    name from a ``<link>`` tag carrying a ``content`` attribute.

    :param channelUrl: URL of the YouTube channel page.
    :param language: language code stored as ``channelLanguage``.
    :return: ``None``. Nothing is inserted when the id cannot be extracted
        or the channel already exists.
    """
    Logger.info("频道链接:"+channelUrl)
    channelId = ""
    channelName = ""
    pageRequest = Request(channelUrl, headers={'User-Agent': 'Mozilla'})
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(pageRequest) as response:
        page = bs(response, features="html.parser")

    for link in page.find_all("link"):
        markup = str(link)
        if "canonical" in markup:
            parts = str(link.get("href", "")).split("/channel/")
            if len(parts) > 1:
                channelId = parts[1]
        if "content=" in markup and link.get("content") is not None:
            channelName = str(link["content"])
    Logger.info("channelId:"+channelId)
    Logger.info("channelName:"+channelName)

    if not channelId:
        # Without an id the row would be unusable and un-deduplicable.
        Logger.error("channelId not found for:" + channelUrl)
        return
    if Channel.get_or_none(Channel.channelId == channelId) is not None:
        Logger.info("频道已存在:" + channelId)
        return
    Channel.create(channelTitle=channelName,
                   channelId=channelId, channelLanguage=language)
url_str.split(" ")[1] + Logger.info("url:{} ,language:{}", url, language) + saveChannel(url, language) diff --git a/init/urlList.txt b/init/urlList.txt new file mode 100644 index 0000000..db25eb4 --- /dev/null +++ b/init/urlList.txt @@ -0,0 +1,2 @@ +https://www.youtube.com/@easymoney380 en +https://www.youtube.com/@Groww en \ No newline at end of file diff --git a/init/urlList_en.txt b/init/urlList_en.txt new file mode 100644 index 0000000..4e7dc86 --- /dev/null +++ b/init/urlList_en.txt @@ -0,0 +1,3 @@ +en +https://www.youtube.com/@easymoney380 +https://www.youtube.com/@Groww diff --git a/init/urlList_india.txt b/init/urlList_india.txt new file mode 100644 index 0000000..170e079 --- /dev/null +++ b/init/urlList_india.txt @@ -0,0 +1,13 @@ +hi +https://www.youtube.com/@goela +https://www.youtube.com/@GoelaSchoolofFinanceShorts/featured +https://www.youtube.com/@InvestYadnya +https://www.youtube.com/@NDTVProfitIndia +https://www.youtube.com/@Neerajjoshi/featured +https://www.youtube.com/@thehimanichaudhary +https://www.youtube.com/@ADigitalBlogger +https://www.youtube.com/@stockburnerofficial +https://www.youtube.com/@nehanagar +https://www.youtube.com/@easymoney380 +https://www.youtube.com/@madhurokade +https://www.youtube.com/@stockmartpro \ No newline at end of file diff --git a/init/urlList_ja.txt b/init/urlList_ja.txt new file mode 100644 index 0000000..d8a06a4 --- /dev/null +++ b/init/urlList_ja.txt @@ -0,0 +1,7 @@ +ja +https://www.youtube.com/@ryogakucho +https://www.youtube.com/@DanTakahashi1 +https://www.youtube.com/@buffett_taro +https://www.youtube.com/@Tsubame104 +https://www.youtube.com/@inc_academy +https://www.youtube.com/@kamioka01 \ No newline at end of file diff --git a/sftp/Contant.py b/sftp/Contant.py new file mode 100644 index 0000000..b5f473a --- /dev/null +++ b/sftp/Contant.py @@ -0,0 +1 @@ +logDir="" \ No newline at end of file diff --git a/sftp/LoggerUtils.py b/sftp/LoggerUtils.py new file mode 100644 index 0000000..388b853 --- /dev/null +++ 
REMOTE_ROOT = "/Inbound/YouTube Captions"
# Remote paths longer than this lose their last 20 characters.
MAX_REMOTE_PATH_LEN = 120


def _remotePathFor(remoteDir, fileName):
    """Return the remote path for *fileName*, shortened when too long."""
    remotePath = remoteDir + "/" + fileName
    if len(remotePath) > MAX_REMOTE_PATH_LEN:
        remotePath = remotePath[:-20] + ".srt"
    return remotePath


def _ensureRemoteDir(sftp_client, remoteDir):
    """Change into *remoteDir*, creating it first when it is missing."""
    try:
        sftp_client.chdir(remoteDir)
    except IOError:
        sftp_client.mkdir(remoteDir)
        sftp_client.chdir(remoteDir)


def _uploadOne(ssh_client, sftp_client, localPath, remotePath):
    """Upload one file and delete the local copy; return the live client.

    A stale remote file at the same (final) path is removed first. When the
    upload fails the SFTP channel is reopened so later files are not affected
    by a broken channel.
    """
    try:
        sftp_client.stat(remotePath)
        sftp_client.remove(remotePath)
        Logger.info("Remote file '{}' deleted.".format(remotePath))
    except FileNotFoundError:
        Logger.info("Remote file '{}' not found.".format(remotePath))
    try:
        if os.path.exists(localPath):
            Logger.info("本地文件 '{}' 存在,开始上传.".format(localPath))
            # NOTE(review): confirm=False skips the post-upload size check,
            # yet the local file is deleted right after — confirm that the
            # server really cannot stat uploaded files.
            sftp_client.put(localPath, remotePath, confirm=False)
            os.remove(localPath)
        else:
            Logger.info("本地文件 '{}' 不存在,无法上传.".format(localPath))
    except Exception as e:
        Logger.info("上传失败 '{}' 文件名长度{}".format(
            remotePath, len(remotePath)))
        Logger.error(e)
        sftp_client.close()
        sftp_client = ssh_client.open_sftp()
    return sftp_client


def _notifyDingTalk(webhook):
    """Post the "finished" message to DingTalk when a webhook is configured."""
    if not webhook:
        Logger.warning("DingTalk webhook not configured, notification skipped")
        return
    jsonData = {
        "msgtype": "text",
        "text": {
            "content": "[Youtube]sftp finished"
        }
    }
    response = requests.post(webhook, json=jsonData, timeout=10)
    if response.ok:
        Logger.info("sftp发送钉钉消息成功...")
    else:
        Logger.error("DingTalk notification failed, status:{}".format(
            response.status_code))


def main():
    """Upload every ``<local>/<channel>/<file>`` to the SFTP inbound folder.

    Usage: ``python3 sftp.py --local="/mnt/tmp_srt_file" --logDir="./logs"``

    Connection settings are read from the ``[sftp_config]`` section of
    ``sftp_config.ini``. The DingTalk webhook URL is read from the
    ``DINGTALK_WEBHOOK`` environment variable or, failing that, from
    ``[dingtalk] webhook`` in the same ini file; it is a secret and must not
    live in source code.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--local", type=str, default="")
    parser.add_argument('--logDir', type=str, default='')
    args = parser.parse_args()
    Contant.logDir = args.logDir
    initLogger()

    config = configparser.ConfigParser()
    config.read('sftp_config.ini')
    hostname = config.get('sftp_config', 'hostname')
    port = config.getint('sftp_config', 'port')
    username = config.get('sftp_config', 'username')
    password = config.get('sftp_config', 'password')
    webhook = os.environ.get("DINGTALK_WEBHOOK") or config.get(
        'dingtalk', 'webhook', fallback="")

    # The password is deliberately kept out of the log.
    Logger.info("host:{},port:{},username:{}".format(hostname, port, username))

    ssh_client = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy trusts any host key on first contact;
    # prefer load_system_host_keys() with a pinned key for production.
    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    sftp_client = None
    try:
        ssh_client.connect(hostname, port, username, password)
        sftp_client = ssh_client.open_sftp()
        Logger.info("SFTP客户端已经建立:{}".format(sftp_client))

        remote_root = REMOTE_ROOT
        local_root = args.local
        Logger.info("remote_root:{},local_root:{}".format(
            remote_root, local_root))

        for name in os.listdir(local_root):
            localDir = local_root + "/" + name
            if not os.path.isdir(localDir):
                # Only per-channel sub-directories are uploaded.
                continue
            remoteDir = remote_root + "/" + name
            _ensureRemoteDir(sftp_client, remoteDir)
            for srt in os.listdir(localDir):
                sftp_client = _uploadOne(
                    ssh_client, sftp_client,
                    localDir + "/" + srt, _remotePathFor(remoteDir, srt))
    finally:
        if sftp_client is not None:
            sftp_client.close()
        ssh_client.close()

    _notifyDingTalk(webhook)


if __name__ == "__main__":
    main()
class DownloadService:
    """Write helpers for the ``DownloadInfo`` table."""

    @staticmethod
    def createOne(videoId, downloadType, tryTime, isFinished):
        """Insert one download-info row.

        :param videoId: YouTube video id the row belongs to.
        :param downloadType: download stage stored in ``downloadType``.
        :param tryTime: initial value of the try counter.
        :param isFinished: ``1`` when finished, ``0`` otherwise.
        """
        DownloadInfo.create(
            videoId=videoId,
            downloadType=downloadType,
            tryTime=tryTime,
            isFinished=isFinished
        )

    @staticmethod
    def updateInfoByVideoId(videoId, tryTime, isFinished, downloadType=None):
        """Update the try counter and finished flag of a video's rows.

        :param videoId: YouTube video id whose rows are updated.
        :param tryTime: new try counter.
        :param isFinished: new finished flag.
        :param downloadType: when given, only the row of that stage is
            updated (same contract as the downloader's service); when
            omitted, every row of the video is updated, as before.
        """
        query = DownloadInfo.update(
            tryTime=tryTime, isFinished=isFinished
        ).where(DownloadInfo.videoId == videoId)
        if downloadType is not None:
            # Successive where() calls are combined with AND.
            query = query.where(DownloadInfo.downloadType == downloadType)
        query.execute()
+ + +def ormInit(): + Channel.create_table() + Video.create_table() + DownloadInfo.create_table() + + +class BaseModel(Model): + class Meta: + database = db + +# 频道信息 + + +class Channel(BaseModel): + id = PrimaryKeyField() + channelId = CharField(null=False) + channelTitle = CharField(null=False) + channelLanguage = CharField() + channelReptileTime = CharField(null=True) + + class Meta: + db_table = 'Channel' + +# 视频信息 + + +class Video(BaseModel): + id = PrimaryKeyField() + videoId = CharField(null=False) + channelId = CharField(null=False) + videoTitle = CharField() + videoLen = IntegerField() + videoType = CharField() + videoPublishTime = CharField() + videoLanguage = CharField() + isDownload = IntegerField() + + class Meta: + db_table = 'Vidoes' + +# 下载信息 + + +class DownloadInfo(BaseModel): + id = PrimaryKeyField() + videoId = CharField() + downloadType = IntegerField() + tryTime = IntegerField() + isFinished = IntegerField() + + class Meta: + db_table = 'Download_info' diff --git a/src/SrcTest.py b/src/SrcTest.py new file mode 100644 index 0000000..e69de29 diff --git a/src/VideoService.py b/src/VideoService.py new file mode 100644 index 0000000..3273712 --- /dev/null +++ b/src/VideoService.py @@ -0,0 +1,31 @@ +import json +from Orm import Video +from playhouse.shortcuts import model_to_dict, dict_to_model + + +class VideoService: + def getOneByVideoId(videoId): + return Video.get_or_none(Video.videoId == videoId) + + def createOne(videoId, channelId, videoTitle, videoLen, videoType, videoPublishTime, videoLanguage, isDownload): + Video.create(videoId=videoId, + channelId=channelId, + videoTitle=videoTitle, + videoLen=videoLen, + videoType=videoType, + videoPublishTime=videoPublishTime, + videoLanguage=videoLanguage, + isDownload=isDownload) + + def updateLenByVideoId(videoId, len): + Video.update(videoLen=len).where(Video.videoId == videoId).execute() + + def getLastVideoByChannelId(channelId): + return Video.select().where(Video.channelId == 
class YouTubeUtil:
    """Helpers around the YouTube Data API v3 used by the crawler."""

    # Keys loaded from api_key.txt by getYoutube(); one is picked at random.
    apiKeys = []
    apiIndex = 0

    # Number of new video ids collected before their durations are fetched.
    _DURATION_BATCH = 10

    @staticmethod
    def _loadApiKeys(path="api_key.txt"):
        """Return the non-blank lines of *path*, stripped of whitespace."""
        with open(path, 'r') as keyFile:
            return [line.strip() for line in keyFile if line.strip()]

    @staticmethod
    def getYoutube():
        """Build a YouTube API client using a randomly chosen API key.

        Requests go through the local HTTP proxy at 127.0.0.1:7890.

        :return: the ``googleapiclient`` resource for ``youtube`` ``v3``.
        :raises ValueError: when ``api_key.txt`` contains no key.
        """
        proxy_info = httplib2.ProxyInfo(
            proxy_type=httplib2.socks.PROXY_TYPE_HTTP, proxy_host="127.0.0.1", proxy_port=7890)
        http = httplib2.Http(timeout=10, proxy_info=proxy_info,
                             disable_ssl_certificate_validation=False)
        YouTubeUtil.apiKeys = YouTubeUtil._loadApiKeys()
        if not YouTubeUtil.apiKeys:
            raise ValueError("api_key.txt contains no API key")
        YouTubeUtil.apiIndex = random.randint(0, len(YouTubeUtil.apiKeys) - 1)
        apiKey = YouTubeUtil.apiKeys[YouTubeUtil.apiIndex]
        # Log only the index: the key itself is a credential.
        Logger.info("当前apiIndex:{}", YouTubeUtil.apiIndex)
        return googleapiclient.discovery.build(
            "youtube", "v3", developerKey=apiKey, http=http
        )

    @staticmethod
    def getVidoeLen(videoIds):
        """Return the ``videos.list`` response (``contentDetails``) for *videoIds*.

        :param videoIds: comma-separated video ids.
        :return: the raw API response dict; ``items`` may be empty.
        """
        youtube = YouTubeUtil.getYoutube()
        request = youtube.videos().list(part="contentDetails", id=videoIds)
        return request.execute()

    @staticmethod
    def getVideoLenByStr(str):
        """Convert an ISO 8601 duration such as ``PT1H2M3S`` to seconds.

        Weeks and days (``P1DT2H``, ``P0D``) are supported in addition to the
        hour/minute/second parts.

        :param str: duration text as returned in ``contentDetails.duration``.
        :return: the duration in whole seconds.
        :raises ValueError: when the text is not a supported duration.
        """
        import re

        duration = str  # the parameter name shadows the builtin; alias it
        match = re.fullmatch(
            r"P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?",
            duration,
        )
        if match is None:
            raise ValueError("unsupported duration: {!r}".format(duration))
        weeks, days, hours, minutes, seconds = (
            int(part) if part else 0 for part in match.groups()
        )
        return (((weeks * 7 + days) * 24 + hours) * 60 + minutes) * 60 + seconds

    @staticmethod
    def _searchPage(youtube, channelId, startTime, endTime, pageToken=None):
        """Run one ``search.list`` request for the channel's videos."""
        params = {
            "part": "snippet",
            "channelId": channelId,
            "maxResults": 50,
            "order": "date",
            "publishedAfter": startTime,
            "publishedBefore": endTime,
            "type": "video",
        }
        if pageToken:
            params["pageToken"] = pageToken
        return youtube.search().list(**params).execute()

    @staticmethod
    def _updateDurations(videoIds):
        """Fetch and store the length of every video in *videoIds*."""
        if not videoIds:
            return
        try:
            lenRes = YouTubeUtil.getVidoeLen(",".join(videoIds))
            for item in lenRes.get("items", []):
                tmpId = item["id"]
                videoLen = YouTubeUtil.getVideoLenByStr(
                    item["contentDetails"]["duration"])
                VideoService.updateLenByVideoId(tmpId, videoLen)
                Logger.info("更新时长,videoId:{},len:{}".format(tmpId, videoLen))
        except Exception:
            # Best effort: a failed duration lookup must not stop the crawl.
            Logger.exception("failed to update video durations")

    @staticmethod
    def getByChannelId(channelId, startTime, endTime):
        """Store every video the channel published between the two times.

        New videos get a ``Video`` row and a stage-1 download-info row; their
        durations are fetched in batches, including the final partial batch
        of each page. After each page the channel's crawl time is set to the
        publish time of its newest stored video.

        :param channelId: YouTube channel id; unknown channels are ignored.
        :param startTime: RFC 3339 lower bound (``publishedAfter``).
        :param endTime: RFC 3339 upper bound (``publishedBefore``).
        """
        channel = ChannelService.getOneByChannelId(channelId)
        if channel is None:
            return
        videoLanguage = str(channel.channelLanguage)
        youtube = YouTubeUtil.getYoutube()
        response = YouTubeUtil._searchPage(
            youtube, channelId, startTime, endTime)
        while True:
            newIds = []
            for item in response.get("items", []):
                try:
                    videoId = item["id"]["videoId"]
                    publisTime = item["snippet"]["publishedAt"]
                    videoTitle = item["snippet"]["title"]
                    if VideoService.getOneByVideoId(str(videoId)) is None:
                        VideoService.createOne(
                            videoId,
                            channelId,
                            videoTitle,
                            0,
                            "video",
                            publisTime,
                            videoLanguage,
                            0,
                        )
                        DownloadService.createOne(videoId, 1, 0, 0)
                        newIds.append(str(videoId))
                        Logger.info(
                            "存储VideoUrl:https://www.youtube.com/watch?v=" + videoId
                        )
                    else:
                        Logger.info("已存在VideoId:{}".format(videoId))
                except Exception:
                    # Skip the broken item but leave a trace in the log.
                    Logger.exception("failed to store search result")
                if len(newIds) >= YouTubeUtil._DURATION_BATCH:
                    YouTubeUtil._updateDurations(newIds)
                    newIds = []
            # Videos left over from the last partial batch of this page.
            YouTubeUtil._updateDurations(newIds)

            # Remember how far the crawl got; skip channels without videos.
            if VideoService.checkExist(channelId):
                newest = VideoService.getLastVideoByChannelId(channelId)
                ChannelService.updateTimeByChannelId(
                    channelId, newest.videoPublishTime)

            nextToken = response.get("nextPageToken")
            if not nextToken:
                Logger.info("no nextPageToken")
                break
            time.sleep(5)
            try:
                response = YouTubeUtil._searchPage(
                    youtube, channelId, startTime, endTime, nextToken)
            except Exception as e:
                Logger.error(e)
                break
def loadEnabledChannelIds(path):
    """Return the ids of the channels flagged as enabled in a list file.

    Each useful line has the form ``<channelId> <flag>``; a channel is
    enabled when its flag is ``"1"``. Blank lines and lines without a flag
    are skipped rather than raising ``IndexError``.

    Args:
        path: Path of the channel list file.

    Returns:
        Enabled channel ids, in file order.
    """
    enabled = []
    # The context manager closes the file even if a line cannot be read.
    with open(path, encoding="utf-8") as listFile:
        for line in listFile:
            fields = line.split()
            if len(fields) >= 2 and fields[1] == "1":
                enabled.append(fields[0])
    return enabled


# py .\main.py --db=../db/youtube_prod.db --logDir=./logs --start="2021-03-14T00:00:01Z" --end="2024-03-14T00:00:01Z"
# python3 ./main.py --db="../db/youtube_prod.db" --logDir="./logs" --start="2023-08-10T00:00:01Z" --end="2023-09-12T00:00:01Z"
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--db", type=str, default="")
    parser.add_argument("--logDir", type=str, default="")
    parser.add_argument("--start", type=str, default="")
    parser.add_argument("--end", type=str, default="")
    args = parser.parse_args()
    # Publish the settings on the shared Contant module before the logger
    # and ORM are initialised.
    Contant.db = args.db
    Contant.logDir = args.logDir
    Contant.startTime = args.start
    Contant.endTime = args.end
    LoggerUtils.initLogger()
    Orm.ormInit()
    LoggerUtils.Logger.info("db:{},logDir:{}".format(Contant.db, Contant.logDir))
    LoggerUtils.Logger.info("starTime:{},endTime:{}".format(Contant.startTime, Contant.endTime))
    # Crawl each enabled channel from channelList.txt, pausing a random few
    # seconds between channels.
    for channelId in loadEnabledChannelIds("channelList.txt"):
        LoggerUtils.Logger.info("channelId:{},startTime:{},endTime:{}".format(channelId, Contant.startTime, Contant.endTime))
        YouTubeUtil.getByChannelId(channelId, Contant.startTime, Contant.endTime)
        sleep_time = random.randint(3, 10)
        LoggerUtils.Logger.info("{}获取完毕,暂停{}秒", channelId, sleep_time)
        time.sleep(sleep_time)
#!/bin/bash
# Stop the background downloader launched by start_download.sh
# (python3 ./main_download.py).
#
# pgrep -f matches against the full command line and never reports itself,
# so the lookup cannot pick up its own process the way `ps | grep` can.
pids=$(pgrep -f 'main_download\.py')
if [ -z "$pids" ]; then
    # Nothing to stop; avoid calling kill without a pid.
    echo "main_download is not running"
    exit 0
fi
echo $pids
# Left unquoted on purpose: several matching pids are passed as separate arguments.
kill -9 $pids
# Scratch check of the 30-slot, comma-separated counter string: build thirty
# "0" slots, overwrite the first slot, and join the slots back together.
SLOT_COUNT = 30

countStr = ",".join(["0"] * SLOT_COUNT)
print(countStr.split(","))
# Named `slots` rather than `list` so the builtin is not shadowed.
slots = countStr.split(",")
slots[0] = 1
print(slots)
# str() because the overwritten slot holds an int while the others are strings.
countStr = ",".join(str(slot) for slot in slots)
print(countStr)
# The database path is read from the command line at import time, because the
# model classes below bind to `db` as soon as they are defined.
#
# parse_known_args() is used instead of parse_args() so that flags owned by
# the importing script (anything other than --db / --logDir) are ignored here
# rather than ending the process with "unrecognized arguments". add_help=False
# leaves -h/--help to the importing script's own parser.
parser = argparse.ArgumentParser(description='', add_help=False)
parser.add_argument('--db', type=str, default='')
parser.add_argument('--logDir', type=str, default='')
args, _unknownArgs = parser.parse_known_args()
Contant.db = args.db
db = SqliteDatabase(Contant.db)
class ViewCountService:
    """Persistence helpers for the per-video view-count history.

    A video's history is stored in ``ViewCountInfo.viewCount`` as one
    comma-separated string of ``SLOT_COUNT`` values, one slot per day.
    """

    # Number of day slots kept in the stored string.
    SLOT_COUNT = 30

    @staticmethod
    def _mergeDayCount(viewCount, day, count):
        """Return the slot string with the slot for ``day`` set to ``count``.

        Args:
            viewCount: Stored comma-separated string, or ``None``/``""`` for
                a video with no history yet (all slots start at ``"0"``).
            day: 1-based slot number, from 1 to ``SLOT_COUNT``.
            count: Value to store; converted with ``str()``.

        Returns:
            A string of exactly ``SLOT_COUNT`` comma-separated values. Shorter
            stored strings are padded with ``"0"``; extra values are dropped.

        Raises:
            IndexError: If ``day`` is outside 1..``SLOT_COUNT``. Zero and
                negative days are rejected instead of wrapping around to the
                end of the list.
        """
        slotCount = ViewCountService.SLOT_COUNT
        if not 1 <= day <= slotCount:
            raise IndexError(
                "day must be between 1 and {}, got {}".format(slotCount, day)
            )
        slots = viewCount.split(",") if viewCount else []
        slots = (slots + ["0"] * slotCount)[:slotCount]
        slots[day - 1] = str(count)
        return ",".join(slots)

    @staticmethod
    def createOrUpdateOne(videoId, day, count):
        """Record ``count`` for ``day`` in the history row of ``videoId``.

        Creates the row when the video has none, otherwise rewrites the
        stored slot string.

        Args:
            videoId: Video whose history is updated.
            day: 1-based slot number, from 1 to 30.
            count: View count to store in that slot.

        Raises:
            IndexError: If ``day`` is outside 1..30.
        """
        # A single lookup both answers "does a row exist" and supplies the
        # stored value.
        existing = ViewCountInfo.get_or_none(ViewCountInfo.videoId == videoId)
        if existing is None:
            ViewCountInfo.create(
                videoId=videoId,
                viewCount=ViewCountService._mergeDayCount(None, day, count),
            )
            return
        merged = ViewCountService._mergeDayCount(existing.viewCount, day, count)
        ViewCountInfo.update(viewCount=merged).where(
            ViewCountInfo.videoId == videoId
        ).execute()
createOne(videoId, channelId, videoTitle, videoLen, videoType, videoPublishTime, videoLanguage, isDownload): + Video.create(videoId=videoId, + channelId=channelId, + videoTitle=videoTitle, + videoLen=videoLen, + videoType=videoType, + videoPublishTime=videoPublishTime, + videoLanguage=videoLanguage, + isDownload=isDownload) + + def updateLenByVideoId(videoId, len): + Video.update(videoLen=len).where(Video.videoId == videoId).execute() + + def getLastVideoByChannelId(channelId): + return Video.select().where(Video.channelId == channelId).order_by(Video.videoPublishTime.desc()).get() + + def getFirstVideoByChannelId(channelId): + return Video.select().where(Video.channelId == channelId).order_by(Video.videoPublishTime).get() + + def checkExist(channelId): + query = Video.select().where(Video.channelId == channelId) + return query.exists() + + def getVideosByTime(startTime,endTime): + return Video.select().where(Video.videoPublishTime >= startTime,Video.videoPublishTime <= endTime).execute() diff --git a/view_count/view_count_main.py b/view_count/view_count_main.py new file mode 100644 index 0000000..0f68ce2 --- /dev/null +++ b/view_count/view_count_main.py @@ -0,0 +1,79 @@ +import argparse +import random +import time +import Contant +from LoggerUtils import Logger, initLogger +import Orm +from VideoService import VideoService +from ChannelService import ChannelService +from VideoCountService import ViewCountService +from func_timeout import func_set_timeout +import func_timeout +import requests +import httplib2 +import googleapiclient.discovery +import googleapiclient.errors +import datetime + + +def getYoutube(): + proxy_info = httplib2.ProxyInfo( + proxy_type=httplib2.socks.PROXY_TYPE_HTTP, proxy_host="127.0.0.1", proxy_port=7890) + http = httplib2.Http(timeout=10, proxy_info=proxy_info, + disable_ssl_certificate_validation=False) + # http = httplib2.Http(timeout=10, disable_ssl_certificate_validation=False) + api_service_name = "youtube" + api_version = "v3" + # 
def formatDayWindow(now, daysAgo):
    """Return the first and last second of the day ``daysAgo`` days before ``now``.

    The timestamps use a four-digit year and hour:minute:second order
    (``2024-01-03T00:00:00Z``), the same shape as the ``--start``/``--end``
    example in this script's usage comment, so plain string comparison
    against stored publish times works.

    Args:
        now: Reference ``datetime``.
        daysAgo: How many whole days to go back from ``now``.

    Returns:
        A ``(startTime, endTime)`` tuple of formatted strings.
    """
    day = now - datetime.timedelta(days=daysAgo)
    start = day.replace(hour=0, minute=0, second=0, microsecond=0)
    end = day.replace(hour=23, minute=59, second=59, microsecond=0)
    timeFormat = "%Y-%m-%dT%H:%M:%SZ"
    return start.strftime(timeFormat), end.strftime(timeFormat)


def updateVideoViewCount(startTime, endTime, day=1):
    """Fetch and store the current view count of videos published in a window.

    Videos returned by ``VideoService.getVideosByTime`` are sent to
    ``videos().list(part="statistics")`` in batches of 30, and each returned
    count is stored through ``ViewCountService.createOrUpdateOne``.

    Args:
        startTime: Lower bound of the publish-time window.
        endTime: Upper bound of the publish-time window.
        day: Slot number passed to ``createOrUpdateOne``; defaults to 1.
    """
    videos = VideoService.getVideosByTime(startTime, endTime)
    youtube = getYoutube()

    def storeBatch(videoIds):
        # Ids are joined without a leading comma.
        request = youtube.videos().list(part="statistics", id=",".join(videoIds))
        response = request.execute()
        for item in response['items']:
            Logger.info(item)
            ViewCountService.createOrUpdateOne(
                item['id'], day, item['statistics']['viewCount'])

    batch = []
    for video in videos:
        Logger.info(video.videoId)
        batch.append(video.videoId)
        if len(batch) == 30:
            storeBatch(batch)
            batch = []
    # The final partial batch is sent too, whatever the total number of videos.
    if batch:
        storeBatch(batch)

# python ./view_count_main.py --db="../db/youtube_prod.db" --logDir="./logs" --start="2024-01-03T00:00:00Z" --end="2024-01-04T00:00:00Z"


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--db', type=str, default='')
    parser.add_argument('--logDir', type=str, default='')
    args = parser.parse_args()
    Contant.db = args.db
    Contant.logDir = args.logDir
    initLogger()
    Orm.ormInit()
    # Refresh the view counts of every video published in the last 30 days,
    # one calendar day at a time.
    now = datetime.datetime.now()
    for i in range(1, 31):
        startTime, endTime = formatDayWindow(now, i)
        Logger.info("startTime:%s, endTime:%s" % (startTime, endTime))
        # NOTE(review): every day is stored in slot 1 (the default `day`);
        # confirm whether the slot was meant to follow `i`.
        updateVideoViewCount(startTime, endTime)