add tongyi speech to text

malinkang · Jul 4, 2024 · f60143f · f60143f
1 parent c977010
commit f60143f
Show file tree

Hide file tree

Showing 6 changed files with 444 additions and 153 deletions.
diff --git a/.github/workflows/speech_text.yml b/.github/workflows/speech_text.yml
@@ -0,0 +1,32 @@
+# Runs scripts/speech_text.py once a day, or on demand via workflow_dispatch.
+name: speech to text
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Daily at 00:00 (GitHub evaluates schedule crons in UTC).
+    - cron: "0 0 * * *"
+# A newer run of this workflow on the same ref cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+jobs:
+  sync:
+    name: Sync
+    runs-on: ubuntu-latest
+    # Secrets exposed to the script as environment variables.
+    env:
+      NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
+      NOTION_PAGE: ${{ secrets.NOTION_PAGE }}
+      COOKIE: ${{ secrets.TONGYI_COOKIE }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          # Quoted so YAML keeps the version a string; an unquoted 3.10
+          # would be parsed as the float 3.1.
+          python-version: "3.9"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: speech to text
+        run: |
+          python -u scripts/speech_text.py
diff --git a/scripts/config.py b/scripts/config.py
@@ -38,44 +38,3 @@
     "链接": URL,
     "收听时长": NUMBER,
 }
-{
-    "标题": {
-        "title": [
-            {
-                "type": "text",
-                "text": {"content": "Vol.224 金色梦乡：你知道人最强大的武器是什么吗？"},
-            }
-        ]
-    },
-    "Description": {
-        "rich_text": [
-            {
-                "type": "text",
-                "text": {
-                    "content": "本期节目我们一起读小说《金色梦乡》，作者伊坂幸太郎。\n《金色梦乡》出版于2007年，讲述了平凡的前送货员青柳雅春被突然当作刺杀首相的凶手，遭到政府通缉，同时被媒体炒作网暴，成为“十恶不赦的罪人”，因此唯一的出路只有拼命逃跑，在惊险的跑路中，与警方短兵相接，也得到情义相挺，莫名其妙的命运捉弄中，他能否顺利逃出重围？这个故事的灵感来自于真实历史事件“肯尼迪遇刺案”。\n伊坂幸太郎（1971-），日本作家。2000年以《奥杜邦的祈祷》获得“新潮推理俱乐部奖”，由此跻身文坛，曾五度入围“直木奖”，是公认的“文坛才子”。\n你会听到：\n1、什么是套路？\n2、看“原著”的意义是什么？\n3、《金色梦乡》和伊坂幸太郎简介。\n4、如何理解书中关于美国、摇滚、披头士、刺杀总统等意象？\n5、精彩片段分享。\n6、伊坂幸太郎的作品为什么畅销？怎么理解“人类最后的武器是信任”和标题《金色梦乡》？\n片头曲：靛厂\n片尾曲：Golden Slumbers (Remastered 2009)\n主播：大壹 / 超哥 / 星光"
-                },
-            }
-        ]
-    },
-    "时间戳": {"number": 1712012400},
-    "发布时间": {
-        "date": {"start": "2024-04-02 07:00:00", "time_zone": "Asia/Shanghai"}
-    },
-    "音频": {
-        "rich_text": [
-            {
-                "type": "text",
-                "text": {
-                    "content": "https://jt.ximalaya.com//GKwRINsJ3BQ4An6aiQK-6Qkb-aacv2-48K.m4a?channel=rss&album_id=29887212&track_id=718781905&uid=68693381&jt=https://audio.xmcdn.com/storages/e11f-audiofreehighqps/0B/16/GKwRINsJ3BQ4An6aiQK-6Qkb-aacv2-48K.m4a"
-                },
-            }
-        ]
-    },
-    "Eid": {
-        "rich_text": [{"type": "text", "text": {"content": "660b3dad1c3c7de44a82f773"}}]
-    },
-    "时长": {"number": 5169},
-    "Podcast": {"relation": [{"id": "87723a05-dd9a-494d-a934-9ff4140fcb21"}]},
-    "链接": {"url": "hhttps://www.xiaoyuzhoufm.com/episode/660b3dad1c3c7de44a82f773"},
-    "状态": {"status": {"name": "在听"}},
-}
diff --git a/scripts/notion_helper.py b/scripts/notion_helper.py
@@ -31,6 +31,7 @@ class NotionHelper:
         "EPISODE_DATABASE_NAME": "Episode",
         "ALL_DATABASE_NAME": "全部",
         "AUTHOR_DATABASE_NAME": "Author",
+        "MINDMAP_DATABASE_NAME": "思维导图",
     }
     database_id_dict = {}
     image_dict = {}
@@ -53,6 +54,9 @@ def __init__(self):
         )      
         self.all_database_id = self.database_id_dict.get(
             self.database_name_dict.get("ALL_DATABASE_NAME")
+        )  
+        self.mindmap_database_id = self.database_id_dict.get(
+            self.database_name_dict.get("MINDMAP_DATABASE_NAME")
         )
 
     def extract_page_id(self, notion_url):
@@ -198,18 +202,35 @@ def delete_block(self, block_id):
 
 
     @retry(stop_max_attempt_number=3, wait_fixed=5000)
-    def query_all(self, database_id, filter):
+    def query_all_by_filter(self, database_id, filter):
+        """Return every row of a database that matches ``filter``.
+
+        Follows the cursor returned by each response, requesting up to 100
+        rows at a time, until the response reports that no more pages remain.
+        Previously the loop was commented out, so only the first page was
+        returned even though the name promises all matches.
+
+        Args:
+            database_id: ID of the database to query.
+            filter: Filter object passed unchanged to ``databases.query``.
+
+        Returns:
+            A list with the "results" entries of all responses combined.
+        """
+        results = []
+        has_more = True
+        start_cursor = None
+        while has_more:
+            response = self.client.databases.query(
+                database_id=database_id,
+                filter=filter,
+                start_cursor=start_cursor,
+                page_size=100,
+            )
+            # Cursor for the next page; only used if has_more is truthy.
+            start_cursor = response.get("next_cursor")
+            has_more = response.get("has_more")
+            results.extend(response.get("results"))
+        return results
+
+    @retry(stop_max_attempt_number=3, wait_fixed=5000)
+    def query_all(self, database_id):
+        """Return every row of the given database.
+
+        Keeps querying with the cursor returned by the previous response,
+        requesting up to 100 rows at a time, until the response reports that
+        no more pages remain.
+
+        Args:
+            database_id: ID of the database to read.
+
+        Returns:
+            A list with the "results" entries of all responses combined.
+        """
         results = []
         has_more = True
         start_cursor = None
         while has_more:
             response = self.client.databases.query(
                 database_id=database_id,
-                filter=filter,
                 start_cursor=start_cursor,
                 page_size=100,
             )
             start_cursor = response.get("next_cursor")
             has_more = response.get("has_more")
             results.extend(response.get("results"))
-        return results
+        return results
diff --git a/scripts/podcast.py b/scripts/podcast.py
@@ -9,9 +9,6 @@
 from dotenv import load_dotenv
 
 load_dotenv()
-DOUBAN_API_HOST = os.getenv("DOUBAN_API_HOST", "frodo.douban.com")
-DOUBAN_API_KEY = os.getenv("DOUBAN_API_KEY", "0ac44ae016490db2204ce0a042db2916")
-
 from config import (
     movie_properties_type_dict,
     book_properties_type_dict,
@@ -110,32 +107,6 @@ def get_episode(pid, timestamp):
     return results
 
 
-@retry(stop_max_attempt_number=3, wait_fixed=5000)
-def get_history():
-    results = []
-    url = "https://api.xiaoyuzhoufm.com/v1/episode-played/list-history"
-    data = {
-        "limit": 25,
-    }
-    loadMoreKey = ""
-    while loadMoreKey is not None:
-        if loadMoreKey:
-            data["loadMoreKey"] = loadMoreKey
-        resp = requests.post(url, json=data, headers=headers)
-        if resp.ok:
-            loadMoreKey = resp.json().get("loadMoreKey")
-            d = resp.json().get("data")
-            for item in d:
-                episode = item.get("episode")
-                pubDate = pendulum.parse(episode.get("pubDate")).in_tz("UTC").int_timestamp
-                episode["pubDate"] = pubDate
-                results.append(episode)
-        else:
-            refresh_token()
-            raise Exception(f"Error {data} {resp.text}")
-    return results
-
-
 def check_podcast(pid):
     """检查是否已经插入过"""
     filter = {"property": "Pid", "rich_text": {"equals": pid}}
@@ -146,16 +117,6 @@ def check_podcast(pid):
         return response["results"][0].get("id")
 
 
-def check_eposide(eid):
-    """检查是否已经插入过"""
-    filter = {"property": "Eid", "rich_text": {"equals": eid}}
-    response = notion_helper.query(
-        database_id=notion_helper.episode_database_id, filter=filter
-    )
-    if len(response["results"]) > 0:
-        return response["results"][0].get("id")
-
-
 def get_timestamp(id):
     """检查是否已经插入过"""
     filter = {"property": "Podcast", "relation": {"contains": id}}
@@ -176,17 +137,6 @@ def get_timestamp(id):
     return 0
 
 
-def delete():
-    """删除未听的"""
-    filter = {"property": "状态", "status": {"equals": "未听"}}
-    results = notion_helper.query_all(
-        database_id=notion_helper.episode_database_id, filter=filter
-    )
-    for index,result in enumerate(results):
-        print(f"正在删除第{index+1}个，共{len(results)}个")
-        notion_helper.delete_block(block_id=result.get("id"))
-
-
 def merge_podcast(list1, list2):
     results = []
     results.extend(list1)
@@ -198,10 +148,10 @@ def merge_podcast(list1, list2):
 
 
 def insert_podcast():
+    refresh_token()
     list1 = get_mileage()
     list2 = get_podcast()
     results = merge_podcast(list1, list2)
-    dict = {}
     for index, result in enumerate(results):
         podcast = {}
         podcast["播客"] = result.get("title")
@@ -211,12 +161,11 @@ def insert_podcast():
         podcast["收听时长"] = result.get("playedSeconds", 0)
         podcast["Description"] = result.get("description")
         podcast["链接"] = f"https://www.xiaoyuzhoufm.com/podcast/{result.get('pid')}"
-        if result.get("latestEpisodePubDate"):
-            podcast["最后更新时间"] = (
-                pendulum.parse(result.get("latestEpisodePubDate"))
-                .in_tz("UTC")
-                .int_timestamp
-            )
+        # latestEpisodePubDate may be absent; pendulum.parse cannot take
+        # None, so only set the field when a date is actually present.
+        latest_pub_date = result.get("latestEpisodePubDate")
+        if latest_pub_date:
+            podcast["最后更新时间"] = (
+                pendulum.parse(latest_pub_date).in_tz("UTC").int_timestamp
+            )
         cover = result.get("image").get("picUrl")
         podcast["全部"] = [
             notion_helper.get_relation_id(
@@ -239,36 +188,31 @@ def insert_podcast():
         print(
             f"正在同步 = {result.get('title')}，共{len(results)}个播客，当前是第{index+1}个"
         )
-
         page_id = check_podcast(pid)
-        if page_id:
-            notion_helper.update_page(page_id=page_id, properties=properties)
-        else:
+        if not page_id:
             page_id = notion_helper.create_page(
                 parent=parent, properties=properties, icon=get_icon(cover)
             ).get("id")
-        dict[pid] =(page_id, cover)
-    return dict
+        else:
+            notion_helper.update_page(page_id=page_id, properties=properties)
+        insert_episode(pid, page_id, cover)
 
 
-def insert_episode(episodes, d):
-    episodes.sort(key=lambda x: x["pubDate"])
-    for index, result in enumerate(episodes):
-        pid = result.get("pid")
-        if pid not in d:
-            continue
+def insert_episode(pid, page_id, cover):
+    timestamp = get_timestamp(page_id)
+    results = get_episode(pid, timestamp)
+    results.sort(key=lambda x: x["pubDate"])
+    for index, result in enumerate(results):
         episode = {}
         episode["标题"] = result.get("title")
         episode["Description"] = result.get("description")
         episode["时间戳"] = result.get("pubDate")
         episode["发布时间"] = result.get("pubDate")
         episode["音频"] = result.get("media").get("source").get("url")
-        eid = result.get("eid")
-        episode["Eid"] = eid
-
+        episode["Eid"] = result.get("eid")
         episode["时长"] = result.get("duration")
         episode["喜欢"] = result.get("isPicked")
-        episode["Podcast"] = [d.get(pid)[0]]
+        episode["Podcast"] = [page_id]
         episode["链接"] = f"hhttps://www.xiaoyuzhoufm.com/episode/{result.get('eid')}"
         status = "未听"
         if result.get("isFinished"):
@@ -278,25 +222,17 @@ def insert_episode(episodes, d):
         episode["状态"] = status
         properties = utils.get_properties(episode, book_properties_type_dict)
         print(
-            f"正在同步 = {result.get('title')}，共{len(episodes)}个Episode，当前是第{index+1}个"
+            f"正在同步 = {result.get('title')}，共{len(results)}个Episode，当前是第{index+1}个"
         )
         parent = {
             "database_id": notion_helper.episode_database_id,
             "type": "database_id",
         }
-        page_id = check_eposide(eid)
-        if page_id:
-            notion_helper.update_page(page_id=page_id, properties=properties)
-        else:
-            notion_helper.create_page(
-                parent=parent, properties=properties, icon=get_icon(d.get(pid)[1])
-            )
+        notion_helper.create_page(
+            parent=parent, properties=properties, icon=get_icon(cover)
+        )
 
 
 if __name__ == "__main__":
     notion_helper = NotionHelper()
-    refresh_token()
-    d = insert_podcast()
-    episodes = get_history()
-    insert_episode(episodes, d)
-    delete()
+    insert_podcast()