You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
63 lines
2.1 KiB
63 lines
2.1 KiB
9 months ago
|
import time
|
||
|
from LoggerUtils import Logger, initLogger
|
||
|
import argparse
|
||
|
import Contant
|
||
|
from Orm import ormInit, Channel
|
||
|
import operator
|
||
|
from bs4 import BeautifulSoup as bs
|
||
|
from urllib.request import urlopen, Request
|
||
|
|
||
|
# py .\init.py --db=../db/youtube_prod.db --logDir=./logs
|
||
|
|
||
|
|
||
|
def saveChannel(channelUrl, language):
    """Scrape a YouTube channel page and persist it to the Channel table.

    Fetches ``channelUrl``, extracts the channel id from the canonical
    <link> tag's href (the ``.../channel/<id>`` segment) and the channel
    title from a <link> tag carrying a ``content`` attribute, then inserts
    a new row unless the channel id already exists.

    Args:
        channelUrl: URL of the YouTube channel page to fetch.
        language: language tag stored alongside the channel record.
    """
    Logger.info("频道链接:"+channelUrl)
    channelId = ""
    channelName = ""
    # Spoof a browser User-Agent: YouTube rejects the default urllib agent.
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection).
    with urlopen(Request(channelUrl, headers={'User-Agent': 'Mozilla'})) as resp:
        videoInfo = bs(resp, features="html.parser")
    for link in videoInfo.find_all("link"):
        link_text = str(link)
        if "canonical" in link_text:
            href = str(link.get('href', ''))
            # Guard the split: a canonical href without "/channel/" would
            # have raised IndexError in the original code.
            if "/channel/" in href:
                channelId = href.split("/channel/", 1)[1]
        if "content=" in link_text:
            channelName = str(link['content'])
    Logger.info("channelId:"+channelId)
    Logger.info("channelName:"+channelName)
    # Skip duplicates: peewee's get_or_none returns None when absent.
    channel = Channel.get_or_none(Channel.channelId == channelId)
    if channel is not None:
        Logger.info("频道已存在:" + channelId)
        return
    Channel.create(channelTitle=channelName,
                   channelId=channelId, channelLanguage=language)
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Usage: py .\init.py --db=../db/youtube_prod.db --logDir=./logs
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--db', type=str, default='')
    parser.add_argument('--logDir', type=str, default='')
    args = parser.parse_args()
    # Publish the CLI settings through the shared Contant module before
    # initializing logging and the ORM, which both read from it.
    Contant.db = args.db
    Contant.logDir = args.logDir
    initLogger()
    ormInit()
    Logger.info("SqlLite存放地址:"+Contant.db)
    Logger.info("日志文件存放地址:"+Contant.logDir)
    Logger.info("开始初始化...")
    # checkInit()
    # Read the list of channel URLs to add from a plain text file.
    Logger.info("开始读取需要新增的频道地址...")
    # Use a context manager so the file handle is closed deterministically
    # (the original `for line in open(...)` leaked it).
    with open("urlList.txt") as url_file:
        urlList = [line.strip('\n') for line in url_file]
    # language = urlList[0]
    for url_str in urlList:
        # Ignore blank/short lines; real entries are "<url> <language>".
        if len(url_str) > 10:
            parts = url_str.split(" ")
            url = parts[0]
            # Guard against lines with no language field, which previously
            # raised IndexError; fall back to an empty language tag.
            language = parts[1] if len(parts) > 1 else ""
            Logger.info("url:{} ,language:{}", url, language)
            saveChannel(url, language)