Skip to content

Commit

Permalink
add tongyi speech to text
Browse files Browse the repository at this point in the history
  • Loading branch information
malinkang committed Jul 4, 2024
1 parent c977010 commit f60143f
Show file tree
Hide file tree
Showing 6 changed files with 444 additions and 153 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/speech_text.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Workflow that runs scripts/speech_text.py on a daily schedule or on demand.
name: speech to text

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  schedule:
    # Every day at 00:00 (GitHub Actions evaluates cron expressions in UTC).
    - cron: "0 0 * * *"
# Only one run per workflow/ref at a time; a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  sync:
    name: Sync
    runs-on: ubuntu-latest
    env:
      NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
      NOTION_PAGE: ${{ secrets.NOTION_PAGE }}
      COOKIE: ${{ secrets.TONGYI_COOKIE }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps it a string; an unquoted 3.10 would be read as 3.1.
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: speech to text
        run: |
          python -u scripts/speech_text.py
41 changes: 0 additions & 41 deletions scripts/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,44 +38,3 @@
"链接": URL,
"收听时长": NUMBER,
}
{
"标题": {
"title": [
{
"type": "text",
"text": {"content": "Vol.224 金色梦乡:你知道人最强大的武器是什么吗?"},
}
]
},
"Description": {
"rich_text": [
{
"type": "text",
"text": {
"content": "本期节目我们一起读小说《金色梦乡》,作者伊坂幸太郎。\n《金色梦乡》出版于2007年,讲述了平凡的前送货员青柳雅春被突然当作刺杀首相的凶手,遭到政府通缉,同时被媒体炒作网暴,成为“十恶不赦的罪人”,因此唯一的出路只有拼命逃跑,在惊险的跑路中,与警方短兵相接,也得到情义相挺,莫名其妙的命运捉弄中,他能否顺利逃出重围?这个故事的灵感来自于真实历史事件“肯尼迪遇刺案”。\n伊坂幸太郎(1971-),日本作家。2000年以《奥杜邦的祈祷》获得“新潮推理俱乐部奖”,由此跻身文坛,曾五度入围“直木奖”,是公认的“文坛才子”。\n你会听到:\n1、什么是套路?\n2、看“原著”的意义是什么?\n3、《金色梦乡》和伊坂幸太郎简介。\n4、如何理解书中关于美国、摇滚、披头士、刺杀总统等意象?\n5、精彩片段分享。\n6、伊坂幸太郎的作品为什么畅销?怎么理解“人类最后的武器是信任”和标题《金色梦乡》?\n片头曲:靛厂\n片尾曲:Golden Slumbers (Remastered 2009)\n主播:大壹 / 超哥 / 星光"
},
}
]
},
"时间戳": {"number": 1712012400},
"发布时间": {
"date": {"start": "2024-04-02 07:00:00", "time_zone": "Asia/Shanghai"}
},
"音频": {
"rich_text": [
{
"type": "text",
"text": {
"content": "https://jt.ximalaya.com//GKwRINsJ3BQ4An6aiQK-6Qkb-aacv2-48K.m4a?channel=rss&album_id=29887212&track_id=718781905&uid=68693381&jt=https://audio.xmcdn.com/storages/e11f-audiofreehighqps/0B/16/GKwRINsJ3BQ4An6aiQK-6Qkb-aacv2-48K.m4a"
},
}
]
},
"Eid": {
"rich_text": [{"type": "text", "text": {"content": "660b3dad1c3c7de44a82f773"}}]
},
"时长": {"number": 5169},
"Podcast": {"relation": [{"id": "87723a05-dd9a-494d-a934-9ff4140fcb21"}]},
"链接": {"url": "hhttps://www.xiaoyuzhoufm.com/episode/660b3dad1c3c7de44a82f773"},
"状态": {"status": {"name": "在听"}},
}
27 changes: 24 additions & 3 deletions scripts/notion_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class NotionHelper:
"EPISODE_DATABASE_NAME": "Episode",
"ALL_DATABASE_NAME": "全部",
"AUTHOR_DATABASE_NAME": "Author",
"MINDMAP_DATABASE_NAME": "思维导图",
}
database_id_dict = {}
image_dict = {}
Expand All @@ -53,6 +54,9 @@ def __init__(self):
)
self.all_database_id = self.database_id_dict.get(
self.database_name_dict.get("ALL_DATABASE_NAME")
)
self.mindmap_database_id = self.database_id_dict.get(
self.database_name_dict.get("MINDMAP_DATABASE_NAME")
)

def extract_page_id(self, notion_url):
Expand Down Expand Up @@ -198,18 +202,35 @@ def delete_block(self, block_id):


@retry(stop_max_attempt_number=3, wait_fixed=5000)
def query_all_by_filter(self, database_id, filter):
    """Query a database with a filter and return the matching rows.

    NOTE(review): despite the name, only ONE page is fetched (at most
    ``page_size=100`` rows). The pagination loop was left commented out
    (``# while has_more:``) -- confirm whether callers rely on this cap
    before re-enabling it.

    Args:
        database_id: ID of the database to query.
        filter: Filter object forwarded unchanged to the query call.

    Returns:
        A list holding the ``results`` of the single query response.
    """
    first_page = self.client.databases.query(
        database_id=database_id,
        filter=filter,
        start_cursor=None,
        page_size=100,
    )
    matches = []
    matches.extend(first_page.get("results"))
    return matches

@retry(stop_max_attempt_number=3, wait_fixed=5000)
def query_all(self, database_id):
    """Return every row of a database, following pagination to the end.

    Args:
        database_id: ID of the database to query.

    Returns:
        A list with the ``results`` entries of every page, in the order
        the pages were returned.
    """
    results = []
    has_more = True
    start_cursor = None
    while has_more:
        # No ``filter`` argument here: this method has no ``filter``
        # parameter, so that name would resolve to the builtin function.
        response = self.client.databases.query(
            database_id=database_id,
            start_cursor=start_cursor,
            page_size=100,
        )
        # Feed the cursor of this page into the request for the next one.
        start_cursor = response.get("next_cursor")
        has_more = response.get("has_more")
        results.extend(response.get("results"))
    return results
108 changes: 22 additions & 86 deletions scripts/podcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@
from dotenv import load_dotenv

load_dotenv()
DOUBAN_API_HOST = os.getenv("DOUBAN_API_HOST", "frodo.douban.com")
DOUBAN_API_KEY = os.getenv("DOUBAN_API_KEY", "0ac44ae016490db2204ce0a042db2916")

from config import (
movie_properties_type_dict,
book_properties_type_dict,
Expand Down Expand Up @@ -110,32 +107,6 @@ def get_episode(pid, timestamp):
return results


@retry(stop_max_attempt_number=3, wait_fixed=5000)
def get_history():
    """Fetch every episode from the play-history endpoint.

    Pages are requested 25 at a time; the ``loadMoreKey`` returned by one
    response is sent with the next request until the server stops
    returning one.

    Returns:
        A list of episode dicts. Each episode's ``pubDate`` is replaced
        in place by its integer UTC timestamp.

    Raises:
        Exception: when a response is not OK. ``refresh_token()`` is
            called first; the ``@retry`` decorator presumably re-runs the
            whole function afterwards -- confirm against its definition.
    """
    url = "https://api.xiaoyuzhoufm.com/v1/episode-played/list-history"
    data = {
        "limit": 25,
    }
    results = []
    # "" means "first page"; None (no key in the response) ends the loop.
    load_more_key = ""
    while load_more_key is not None:
        if load_more_key:
            data["loadMoreKey"] = load_more_key
        resp = requests.post(url, json=data, headers=headers)
        if not resp.ok:
            refresh_token()
            raise Exception(f"Error {data} {resp.text}")
        # Decode the body once instead of once per field.
        payload = resp.json()
        load_more_key = payload.get("loadMoreKey")
        # A missing or null "data" field is treated as an empty page.
        for item in payload.get("data") or []:
            episode = item.get("episode")
            episode["pubDate"] = (
                pendulum.parse(episode.get("pubDate")).in_tz("UTC").int_timestamp
            )
            results.append(episode)
    return results


def check_podcast(pid):
"""检查是否已经插入过"""
filter = {"property": "Pid", "rich_text": {"equals": pid}}
Expand All @@ -146,16 +117,6 @@ def check_podcast(pid):
return response["results"][0].get("id")


def check_eposide(eid):
    """Look up an episode by its Eid in the episode database.

    Returns the page id of the first match, or None when no row has
    that Eid (i.e. the episode has not been inserted yet).
    """
    eid_filter = {"property": "Eid", "rich_text": {"equals": eid}}
    response = notion_helper.query(
        database_id=notion_helper.episode_database_id, filter=eid_filter
    )
    if len(response["results"]) == 0:
        return None
    return response["results"][0].get("id")


def get_timestamp(id):
"""检查是否已经插入过"""
filter = {"property": "Podcast", "relation": {"contains": id}}
Expand All @@ -176,17 +137,6 @@ def get_timestamp(id):
return 0


def delete():
    """Delete every episode whose status is "未听" (not listened to).

    Queries the episode database for rows with that status, then removes
    them one by one, printing progress before each deletion.
    """
    unheard_filter = {"property": "状态", "status": {"equals": "未听"}}
    results = notion_helper.query_all(
        database_id=notion_helper.episode_database_id,
        filter=unheard_filter,
    )
    for index, row in enumerate(results):
        print(f"正在删除第{index+1}个,共{len(results)}个")
        row_id = row.get("id")
        notion_helper.delete_block(block_id=row_id)


def merge_podcast(list1, list2):
results = []
results.extend(list1)
Expand All @@ -198,10 +148,10 @@ def merge_podcast(list1, list2):


def insert_podcast():
refresh_token()
list1 = get_mileage()
list2 = get_podcast()
results = merge_podcast(list1, list2)
dict = {}
for index, result in enumerate(results):
podcast = {}
podcast["播客"] = result.get("title")
Expand All @@ -211,12 +161,11 @@ def insert_podcast():
podcast["收听时长"] = result.get("playedSeconds", 0)
podcast["Description"] = result.get("description")
podcast["链接"] = f"https://www.xiaoyuzhoufm.com/podcast/{result.get('pid')}"
if result.get("latestEpisodePubDate"):
podcast["最后更新时间"] = (
pendulum.parse(result.get("latestEpisodePubDate"))
.in_tz("UTC")
.int_timestamp
)
podcast["最后更新时间"] = (
pendulum.parse(result.get("latestEpisodePubDate"))
.in_tz("UTC")
.int_timestamp
)
cover = result.get("image").get("picUrl")
podcast["全部"] = [
notion_helper.get_relation_id(
Expand All @@ -239,36 +188,31 @@ def insert_podcast():
print(
f"正在同步 = {result.get('title')},共{len(results)}个播客,当前是第{index+1}个"
)

page_id = check_podcast(pid)
if page_id:
notion_helper.update_page(page_id=page_id, properties=properties)
else:
if not page_id:
page_id = notion_helper.create_page(
parent=parent, properties=properties, icon=get_icon(cover)
).get("id")
dict[pid] =(page_id, cover)
return dict
else:
notion_helper.update_page(page_id=page_id, properties=properties)
insert_episode(pid, page_id, cover)


def insert_episode(episodes, d):
episodes.sort(key=lambda x: x["pubDate"])
for index, result in enumerate(episodes):
pid = result.get("pid")
if pid not in d:
continue
def insert_episode(pid, page_id, cover):
timestamp = get_timestamp(page_id)
results = get_episode(pid, timestamp)
results.sort(key=lambda x: x["pubDate"])
for index, result in enumerate(results):
episode = {}
episode["标题"] = result.get("title")
episode["Description"] = result.get("description")
episode["时间戳"] = result.get("pubDate")
episode["发布时间"] = result.get("pubDate")
episode["音频"] = result.get("media").get("source").get("url")
eid = result.get("eid")
episode["Eid"] = eid

episode["Eid"] = result.get("eid")
episode["时长"] = result.get("duration")
episode["喜欢"] = result.get("isPicked")
episode["Podcast"] = [d.get(pid)[0]]
episode["Podcast"] = [page_id]
episode["链接"] = f"hhttps://www.xiaoyuzhoufm.com/episode/{result.get('eid')}"
status = "未听"
if result.get("isFinished"):
Expand All @@ -278,25 +222,17 @@ def insert_episode(episodes, d):
episode["状态"] = status
properties = utils.get_properties(episode, book_properties_type_dict)
print(
f"正在同步 = {result.get('title')},共{len(episodes)}个Episode,当前是第{index+1}个"
f"正在同步 = {result.get('title')},共{len(results)}个Episode,当前是第{index+1}个"
)
parent = {
"database_id": notion_helper.episode_database_id,
"type": "database_id",
}
page_id = check_eposide(eid)
if page_id:
notion_helper.update_page(page_id=page_id, properties=properties)
else:
notion_helper.create_page(
parent=parent, properties=properties, icon=get_icon(d.get(pid)[1])
)
notion_helper.create_page(
parent=parent, properties=properties, icon=get_icon(cover)
)


if __name__ == "__main__":
notion_helper = NotionHelper()
refresh_token()
d = insert_podcast()
episodes = get_history()
insert_episode(episodes, d)
delete()
insert_podcast()
Loading

0 comments on commit f60143f

Please sign in to comment.