|
|
@ -4,6 +4,33 @@ from urllib.request import urlopen, Request |
|
|
|
import json |
|
|
|
import Contant |
|
|
|
from sqlalchemy import create_engine |
|
|
|
from entity.ChannelEntity import Channel |
|
|
|
from service.ChannelService import ChannelService |
|
|
|
import operator |
|
|
|
|
|
|
|
|
|
|
|
def saveChannel(channelUrl, language, region):
    """Fetch a channel page and store the channel unless it already exists.

    Downloads ``channelUrl``, parses the HTML and scans its ``<link>`` tags.
    The channel id is the text following ``/channel/`` in the ``href`` of a
    tag whose markup mentions ``canonical``; the channel name is the
    ``content`` attribute of a tag whose markup contains ``content=``. When
    several tags match, the last one wins. If ``ChannelService`` has no
    record for the id, a new record is inserted.

    Args:
        channelUrl: URL of the channel page to download.
        language: Value stored as the new record's ``channelLanguage``.
        region: Value stored as the new record's ``region``.

    Returns:
        None. Nothing is inserted when no channel id could be extracted or
        when the channel is already stored.
    """
    Logger.info("频道链接:"+channelUrl)
    channelId = ""
    channelName = ""

    request = Request(channelUrl, headers={'User-Agent': 'Mozilla'})
    # The parser consumes the response while it is constructed, so the
    # connection can be released as soon as the soup object exists.
    with urlopen(request) as response:
        videoInfo = bs(response, features="html.parser")

    for link in videoInfo.find_all("link"):
        markup = str(link)
        if "canonical" in markup:
            # A tag can mention "canonical" without carrying an href, or
            # with an href that has no "/channel/" segment; skip those
            # instead of failing with KeyError / IndexError.
            parts = str(link.get('href', '')).split("/channel/")
            if len(parts) > 1:
                channelId = parts[1]
        if "content=" in markup:
            # "content=" may also occur inside another attribute's value,
            # so only accept the tag when the attribute really exists.
            content = link.get('content')
            if content is not None:
                channelName = str(content)

    Logger.info("channelId:"+channelId)
    Logger.info("channelName:"+channelName)

    if not channelId:
        # Without an id there is nothing meaningful to look up or insert;
        # storing a record with an empty id would corrupt the table.
        Logger.info("未能从页面解析出channelId，跳过:"+channelUrl)
        return

    channel: Channel = ChannelService.queryOneByChannelId(channelId)
    if channel:
        Logger.info("频道{}已存在".format(channelId))
        return

    ChannelService.insertOneByValues(
        channelId=channelId, channelTitle=channelName,
        channelLanguage=language, region=region)
|
|
|
|
|
|
|
|
|
|
|
# Example invocation: py .\init.py --db=../db/youtube_prod.db --logDir=./logs
|
|
|
if __name__ == "__main__": |
|
|
@ -25,4 +52,19 @@ if __name__ == "__main__": |
|
|
|
dbDatabase = data['mysql']['database']

# Log where we are connecting to, but never the password: log files are
# usually readable by more people than the database credentials should be.
Logger.info("尝试连接mysql host:'{}' port:'{}' username:'{}' database:'{}'",
            dbHost, dbPort, dbUserName, dbDatabase)

# Build the connection URL once and create a single engine that is bound to
# both the local name and the shared Contant.engin attribute, instead of
# creating two identical engines from a duplicated URL literal.
# NOTE(review): a password containing URL-reserved characters such as '@'
# or '/' would need percent-encoding here - confirm how the config stores it.
dbUrl = f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}'
engine = Contant.engin = create_engine(dbUrl)

# NOTE(review): create_engine() is documented as lazy, so no connection has
# actually been opened when this message is logged - confirm this is intended.
Logger.info("连接mysql成功")
|
|
|
|
|
|
|
Logger.info("开始读取需要新增的频道地址...")

# Read every line of the list file; the context manager closes the file
# handle as soon as the lines are in memory.
with open("urlList.txt") as urlFile:
    urlList = [line.strip('\n') for line in urlFile]

# The first two lines are a header (language, then region); the channel
# URLs follow. Fail with a clear message instead of a bare IndexError when
# the header is missing.
if len(urlList) < 2:
    raise ValueError(
        "urlList.txt must start with a language line and a region line")

language = urlList[0]
region = urlList[1]
Logger.info("language:{} region:{}".format(language, region))

# Skip the two header lines so they are never mistaken for channel URLs.
for url in urlList[2:]:
    # Entries of 20 characters or fewer (blank lines included) are skipped.
    if len(url) > 20:
        saveChannel(url, language, region)
|
|
|