from LoggerUtils import Logger, initLogger
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen, Request
import json
import Contant
from sqlalchemy import create_engine
from entity.ChannelEntity import Channel
from service.ChannelService import ChannelService
import operator


def saveChannel(channelUrl, language, region):
    """Fetch a channel page and store the channel if it is not yet known.

    The page at ``channelUrl`` is downloaded and its ``<link>`` tags are
    scanned: the channel id is taken from the part of the canonical link's
    ``href`` that follows ``/channel/``, and the channel name from the
    ``content`` attribute of a link tag that carries one.

    Args:
        channelUrl: URL of the channel page to fetch.
        language: Value stored as the channel's language.
        region: Value stored as the channel's region.

    Returns:
        None. Nothing is inserted when no channel id can be parsed from the
        page or when ``ChannelService`` already has a channel with that id.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    Logger.info("频道链接:"+channelUrl)
    channelId = ""
    channelName = ""

    request = Request(channelUrl, headers={'User-Agent': 'Mozilla'})
    # Parse inside the ``with`` so the HTTP response is always closed.
    with urlopen(request) as response:
        page = bs(response, features="html.parser")

    for link in page.find_all("link"):
        markup = str(link)
        if "canonical" in markup:
            href = link.get('href')
            if href is not None:
                # Only accept the link when it really has a "/channel/"
                # segment; otherwise indexing [1] would raise IndexError.
                parts = str(href).split("/channel/")
                if len(parts) > 1:
                    channelId = parts[1]
        if "content=" in markup:
            # The substring test can also match other attributes or URL
            # query strings, so read the attribute defensively.
            content = link.get('content')
            if content is not None:
                channelName = str(content)

    Logger.info("channelId:"+channelId)
    Logger.info("channelName:"+channelName)

    if not channelId:
        # Without an id we would query for, and then insert, an empty key.
        Logger.info("未能从页面解析出channelId, 跳过: {}".format(channelUrl))
        return

    channel: Channel = ChannelService.queryOneByChannelId(channelId)
    if channel:
        Logger.info("频道{}已存在".format(channelId))
        return

    ChannelService.insertOneByValues(
        channelId=channelId,
        channelTitle=channelName,
        channelLanguage=language,
        region=region)


# py .\init.py --db=../db/youtube_prod.db --logDir=./logs
# NOTE(review): no command-line parsing is visible in this file; settings are
# read from init_channel_config.json below. Confirm whether the flags above
# are still used.
if __name__ == "__main__":
    # Read the configuration file.
    with open('init_channel_config.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Initialise logging.
    Contant.logDir = data['log']['dir']
    Contant.logFileName = data['log']['fileName']
    initLogger(Contant.logDir, Contant.logFileName)

    # Set up the MySQL engine.
    dbHost = data['mysql']['host']
    dbPort = data['mysql']['port']
    dbUserName = data['mysql']['username']
    dbPassword = data['mysql']['password']
    dbDatabase = data['mysql']['database']
    # Pre-format the message like every other Logger call in this file, and
    # mask the password so the credential never reaches the log.
    Logger.info(
        "尝试连接mysql host:'{}' port:'{}' username:'{}' password:'{}' database:'{}'".format(
            dbHost, dbPort, dbUserName, "***", dbDatabase))
    # NOTE(review): the credentials are interpolated into the URL verbatim; a
    # password containing characters such as '@' or '/' would need URL
    # escaping - confirm against the values used in the config.
    Contant.engin = create_engine(
        f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}')
    # NOTE: create_engine() does not open a connection by itself; the first
    # real connection is made when the engine is first used.
    Logger.info("连接mysql成功")

    Logger.info("开始读取需要新增的频道地址...")
    # Read the input file; ``with`` guarantees the handle is closed.
    with open("urlList.txt") as urlFile:
        urlList = [line.strip('\n') for line in urlFile]

    # The first two lines are the language and the region; the remaining
    # lines are channel URLs.
    if len(urlList) < 2:
        Logger.info("urlList.txt 至少需要两行: language 和 region")
        raise SystemExit(1)
    language = urlList[0]
    region = urlList[1]
    Logger.info("language:{} region:{}".format(language, region))

    # Skip the two header lines so they are never mistaken for URLs; the
    # length filter still drops blank or obviously invalid lines.
    for url in urlList[2:]:
        if len(url) > 20:
            saveChannel(url, language, region)