From d990b31ed213d6605f3a9f65564536450047fb30 Mon Sep 17 00:00:00 2001
From: zhangshu <appolli9527@163.com>
Date: Wed, 21 Aug 2024 22:51:39 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=A4=8D=E5=88=B6=E5=B7=B2?=
 =?UTF-8?q?=E6=9C=89srt=E6=96=87=E4=BB=B6=E7=9A=84=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore              |   2 +-
 .vscode/launch.json     |  45 ---------------
 parse_video.py          | 121 ++++++++++++++++++++++++++++++++++++++++
 parse_video_config.json |  15 +++++
 service/VideoService.py |   7 +++
 5 files changed, 144 insertions(+), 46 deletions(-)
 delete mode 100644 .vscode/launch.json
 create mode 100644 parse_video.py
 create mode 100644 parse_video_config.json

diff --git a/.gitignore b/.gitignore
index f8b73e7..f06f1b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,4 +137,4 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-
+.vscode/
diff --git a/.vscode/launch.json b/.vscode/launch.json
deleted file mode 100644
index 8532091..0000000
--- a/.vscode/launch.json
+++ /dev/null
@@ -1,45 +0,0 @@
-{
-    // 使用 IntelliSense 了解相关属性。 
-    // 悬停以查看现有属性的描述。
-    // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "name": "init channel",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "init_channel.py",
-            "console": "integratedTerminal"
-        },
-        {
-            "name": "move_data",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "move_data.py",
-            "console": "integratedTerminal"
-        },
-        {
-            "name": "search video",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "search_video.py",
-            "console": "integratedTerminal",
-            "args": ["--start", "2023-09-10T00:00:01Z",
-                    "--end", "2023-09-11T00:00:01Z"]
-        },
-        {
-            "name": "download video",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "download_video.py",
-            "console": "integratedTerminal"
-        },
-        {
-            "name": "test",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "test.py",
-            "console": "integratedTerminal"
-        },
-    ]
-}
\ No newline at end of file
diff --git a/parse_video.py b/parse_video.py
new file mode 100644
index 0000000..4bc09d0
--- /dev/null
+++ b/parse_video.py
@@ -0,0 +1,121 @@
+import os
+import time
+from LoggerUtils import Logger, initLogger
+from bs4 import BeautifulSoup as bs
+from urllib.request import urlopen, Request
+import json
+import Contant
+from sqlalchemy import create_engine
+from entity.DownloadInfoEntity import DownloadInfo
+from entity.VideoEntity import Video
+from entity.ChannelEntity import Channel
+from service.DownloadInfoService import DownloadInfoService
+from service.VideoService import VideoService
+from common.YoutubeUtils import YouTubeUtil
+from common.DownloadUtils import DownloadUtil
+from service.ChannelService import ChannelService
+import operator
+import argparse
+import difflib
+from shutil import copyfile
+
+
+def get_all_files(directory):
+    """
+    Recursively collect the path of every file below a directory.
+    :param directory: root directory to walk
+    :return: list of file paths, each joined with the directory it was found in
+    """
+    files = []
+    for root, _dirs, filenames in os.walk(directory):
+        for filename in filenames:
+            # Join with the walked directory so callers get a usable path
+            files.append(os.path.join(root, filename))
+    return files
+
+
+def getSrtFileName(video: Video):
+    """Build the expected .srt file name for a video record.
+
+    The name is '<publish date>-<language>-<title>.srt'; the date is the
+    part of str(video.videoPublishTime) that precedes the first 'T'.
+    """
+    # '/' becomes U+2215; '\', '|', '<', '>' and ':' are removed from the title
+    charMap = str.maketrans({"/": u"\u2215", "\\": "", "|": "", "<": "",
+                             ">": "", ":": ""})
+    safeTitle = video.videoTitle.translate(charMap)
+    publishDate = str(video.videoPublishTime).split("T")[0]
+    language = str(video.videoLanguage)
+    srtfileName = f'{publishDate}-{language}-{safeTitle}.srt'
+    return srtfileName
+
+
+def get_equal_rate_1(str1, str2):  # similarity score in [0, 1]
+    return difflib.SequenceMatcher(a=str1, b=str2).quick_ratio()  # NOTE(review): upper bound on ratio()
+
+
+if __name__ == "__main__":
+    # Read the configuration file
+    with open('parse_video_config.json', 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    # Initialise logging
+    Contant.logDir = data['log']['dir']
+    Contant.logFileName = data['log']['fileName']
+    initLogger(Contant.logDir, Contant.logFileName)
+
+    # Connect to mysql
+    dbHost = data['mysql']['host']
+    dbPort = data['mysql']['port']
+    dbUserName = data['mysql']['username']
+    dbPassword = data['mysql']['password']
+    dbDatabase = data['mysql']['database']
+    # The password is deliberately kept out of the log output
+    Logger.info("尝试连接mysql host:'{}' port:'{}' username:'{}' database:'{}'",
+                dbHost, dbPort, dbUserName, dbDatabase)
+    Contant.engin = create_engine(
+        f'mysql+mysqlconnector://{dbUserName}:{dbPassword}@{dbHost}:{dbPort}/{dbDatabase}')
+    Logger.info("连接mysql成功")
+
+    parseRoot = data['parse_root']
+    newSrtPath = data['new_srt_path']
+
+    Logger.info(f'parseRoot: {parseRoot}')
+    # Directories below parseRoot are candidate channel names (relpath is separator-safe)
+    chanelNameList = [os.path.relpath(root, parseRoot)
+                      for root, _dirs, _files in os.walk(parseRoot) if root != parseRoot]
+
+    # Map each channelId to the directory whose name resembles its title
+    channel_dict = {}
+    channels = ChannelService.queryAllChannel()
+    for channel in channels:
+        for channelName in chanelNameList:
+            if get_equal_rate_1(channel.channelTitle, channelName) > 0.9:
+                channel_dict[str(channel.channelId)] = f"{parseRoot}/{channelName}"
+
+    # Walk channel_dict and copy the subtitle file of every video
+    for key, value in channel_dict.items():
+        channel: Channel = ChannelService.queryOneByChannelId(key)
+        videos = VideoService.queryAllbyChannelId(key)
+        Logger.info(f"key: {key} len: {len(videos)}")
+        # List the channel directory once rather than once per video
+        candidates = [(root, name) for root, _dirs, names in os.walk(value)
+                      for name in names]
+        dst_dir = f"{newSrtPath}/{channel.region}/{channel.channelId}-{channel.channelTitle}"
+        for video in videos:
+            srtFileName = getSrtFileName(video=video)
+            # Use only the closest file so each video is imported at most once
+            best = max(candidates, default=None,
+                       key=lambda c: get_equal_rate_1(srtFileName, c[1]))
+            if best is None or get_equal_rate_1(srtFileName, best[1]) <= 0.8:
+                continue
+            src_path = f"{best[0]}/{best[1]}"
+            if not os.path.exists(dst_dir):
+                Logger.info("开始创建文件夹:" + dst_dir)
+                os.makedirs(dst_dir)
+            dst_path = f"{dst_dir}/{video.videoId}.srt"
+            Logger.info(f"src_path:{src_path} dst_path:{dst_path}")
+            copyfile(src_path, dst_path)
+            # Then load the copied srt file into the database
+            DownloadUtil.iterateSrt(
+                srtFilePath=dst_path, videoId=video.videoId, channelId=channel.channelId)
diff --git a/parse_video_config.json b/parse_video_config.json
new file mode 100644
index 0000000..23e6d56
--- /dev/null
+++ b/parse_video_config.json
@@ -0,0 +1,15 @@
+{
+    "mysql": {
+        "host": "47.108.20.249",
+        "port": "3306",
+        "username": "root",
+        "password": "casino888!",
+        "database": "youtube"
+    },
+    "log": {
+        "dir": "./logs",
+        "fileName": "parse_video"
+    },
+    "parse_root": "E:/code/python/tmp_srt_file",
+    "new_srt_path": "/mnt/new_srt_path"
+}
\ No newline at end of file
diff --git a/service/VideoService.py b/service/VideoService.py
index 54c03c5..5019a2f 100644
--- a/service/VideoService.py
+++ b/service/VideoService.py
@@ -12,6 +12,13 @@ class VideoService:
         session.close()
         return video
 
+    def queryAllbyChannelId(channelId):
+        session = getSession()  # closed in the finally clause below
+        try:  # return every Video row whose channelId equals the argument
+            return session.query(Video).filter(Video.channelId == channelId).all()
+        finally:  # release the session even when the query raises
+            session.close()
+
     def insertOne(videoId, ChannelId, videoTitle, videoLen, videoType, videoPublishTime, videoLanguage, isDownload):
         session = getSession()
         video: Video = Video(videoId=videoId, ChannelId=ChannelId, videoTitle=videoTitle,