Skip to content

Commit f3f7467

Browse files
Merge pull request #20 from TheExplainthis/develop
Summary of Youtube video and news website
2 parents 9fa2ae9 + 24419f6 commit f3f7467

File tree

7 files changed

+185
-7
lines changed

7 files changed

+185
-7
lines changed

README.en.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77

88
## Update
9+
- 2023/03/23 Update summary of Youtube videos and news articles (supports: United Daily News, SET, Yahoo News, Central News Agency, Storm Media, TVBS, Liberty Times, ETtoday, China Times, Line News, TTV News)
910
- 2023/03/18 Added Whisper service, users can now add their own tokens, and added command (refer to the documentation below)
1011
- 2023/03/03 Model change to chat completion: `gpt-3.5-turbo`
1112

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77

88
## 更新
9+
- 2023/03/23 更新總結 Youtube 影片內容、新聞文章(支援:聯合報、Yahoo 新聞、三立新聞網、中央通訊社、風傳媒、TVBS、自由時報、ETtoday、中時新聞網、Line 新聞、台視新聞網)
910
- 2023/03/18 新增 Whisper 服務、用戶可以新增自己的 Token、新增指令(參考文件下方)
1011
- 2023/03/03 模型換成 chat completion: `gpt-3.5-turbo`
1112

main.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@
1717
from src.logger import logger
1818
from src.storage import Storage
1919
from src.utils import get_role_and_content
20+
from src.service.youtube import Youtube, YoutubeTranscriptReader
21+
from src.service.website import Website, WebsiteReader
2022

2123
load_dotenv('.env')
2224

2325
app = Flask(__name__)
2426
line_bot_api = LineBotApi(os.getenv('LINE_CHANNEL_ACCESS_TOKEN'))
2527
handler = WebhookHandler(os.getenv('LINE_CHANNEL_SECRET'))
2628
storage = Storage('db.json')
29+
youtube = Youtube(step=4)
30+
website = Website()
31+
2732

2833
memory = Memory(system_message=os.getenv('SYSTEM_MESSAGE'), memory_message_count=2)
2934
model_management = {}
@@ -86,14 +91,37 @@ def handle_text_message(event):
8691
memory.append(user_id, 'assistant', url)
8792

8893
else:
94+
user_model = model_management[user_id]
8995
memory.append(user_id, 'user', text)
90-
is_successful, response, error_message = model_management[user_id].chat_completions(memory.get(user_id), os.getenv('OPENAI_MODEL_ENGINE'))
91-
if not is_successful:
92-
raise Exception(error_message)
93-
role, response = get_role_and_content(response)
94-
msg = TextSendMessage(text=response)
96+
url = website.get_url_from_text(text)
97+
if url:
98+
if youtube.retrieve_video_id(text):
99+
is_successful, chunks, error_message = youtube.get_transcript_chunks(youtube.retrieve_video_id(text))
100+
if not is_successful:
101+
raise Exception(error_message)
102+
youtube_transcript_reader = YoutubeTranscriptReader(user_model, os.getenv('OPENAI_MODEL_ENGINE'))
103+
is_successful, response, error_message = youtube_transcript_reader.summarize(chunks)
104+
if not is_successful:
105+
raise Exception(error_message)
106+
role, response = get_role_and_content(response)
107+
msg = TextSendMessage(text=response)
108+
else:
109+
chunks = website.get_content_from_url(url)
110+
if len(chunks) == 0:
111+
raise Exception('無法撈取此網站文字')
112+
website_reader = WebsiteReader(user_model, os.getenv('OPENAI_MODEL_ENGINE'))
113+
is_successful, response, error_message = website_reader.summarize(chunks)
114+
if not is_successful:
115+
raise Exception(error_message)
116+
role, response = get_role_and_content(response)
117+
msg = TextSendMessage(text=response)
118+
else:
119+
is_successful, response, error_message = user_model.chat_completions(memory.get(user_id), os.getenv('OPENAI_MODEL_ENGINE'))
120+
if not is_successful:
121+
raise Exception(error_message)
122+
role, response = get_role_and_content(response)
123+
msg = TextSendMessage(text=response)
95124
memory.append(user_id, role, response)
96-
97125
except ValueError:
98126
msg = TextSendMessage(text='Token 無效,請重新註冊,格式為 /註冊 sk-xxxxx')
99127
except KeyError:
@@ -134,6 +162,8 @@ def handle_audio_message(event):
134162
msg = TextSendMessage(text=response)
135163
except ValueError:
136164
msg = TextSendMessage(text='請先註冊你的 API Token,格式為 /註冊 [API TOKEN]')
165+
except KeyError:
166+
msg = TextSendMessage(text='請先註冊 Token,格式為 /註冊 sk-xxxxx')
137167
except Exception as e:
138168
memory.remove(user_id)
139169
if str(e).startswith('Incorrect API key provided'):

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
line-bot-sdk==2.4.1
22
python-dotenv==0.21.1
33
Flask==2.2.2
4-
opencc-python-reimplemented==0.1.4
4+
opencc-python-reimplemented==0.1.4
5+
beautifulsoup4==4.11.2
6+
youtube-transcript-api==0.5.0

src/service/__init__.py

Whitespace-only changes.

src/service/website.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import os
2+
import re
3+
import requests
4+
from bs4 import BeautifulSoup
5+
6+
7+
WEBSITE_SYSTEM_MESSAGE = "你現在非常擅於做資料的整理、總結、歸納、統整,並能專注於細節、且能提出觀點"
8+
WEBSITE_MESSAGE_FORMAT = """
9+
針對這個連結的內容:
10+
\"\"\"
11+
{}
12+
\"\"\"
13+
14+
請關注幾個點:
15+
1. 他的主題為何?
16+
2. 他的重點為何?
17+
3. 他獨特的觀點為何?
18+
19+
你需要回傳的格式是:
20+
- 主題: '...'
21+
- 重點: '...'
22+
- 獨特觀點: '...'
23+
"""
24+
25+
26+
class Website:
    """Find a URL inside chat text and pull readable text out of that page."""

    def get_url_from_text(self, text: str):
        """Return the first ``http(s)://`` URL found in ``text``, or ``None``.

        The URL runs from the scheme up to the next whitespace character.
        """
        match = re.search(r'https?://\S+', text)
        return match.group() if match else None

    @staticmethod
    def _collect_text(elements):
        """Return the stripped ``.text`` of each element, dropping empty results."""
        stripped = (element.text.strip() for element in elements)
        return [text for text in stripped if text]

    def get_content_from_url(self, url: str, timeout: float = 10):
        """Download ``url`` and return the text of its main content elements.

        ``<article>`` elements are preferred. When none of them yields any
        text, ``<div>`` elements carrying the ``content`` class are used.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds handed to ``requests.get``. Without a timeout a
                stalled server would block the caller indefinitely.

        Returns:
            A list of non-empty strings, one per matching element; an empty
            list when nothing usable was found.

        Raises:
            Whatever ``requests.get`` raises (connection errors, timeouts).
        """
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.text, 'html.parser')
        chunks = self._collect_text(soup.find_all('article'))
        if not chunks:
            # Whitespace-only <article> tags must not hide the fallback, and
            # must not make the result look non-empty to the caller.
            chunks = self._collect_text(soup.find_all('div', class_='content'))
        return chunks
42+
43+
44+
class WebsiteReader:
45+
def __init__(self, model=None, model_engine=None):
46+
self.system_message = os.getenv('WEBSITE_SYSTEM_MESSAGE') or WEBSITE_SYSTEM_MESSAGE
47+
self.message_format = os.getenv('WEBSITE_MESSAGE_FORMAT') or WEBSITE_MESSAGE_FORMAT
48+
self.model = model
49+
self.text_length_limit = 1800
50+
self.model_engine = model_engine
51+
52+
def send_msg(self, msg):
53+
return self.model.chat_completions(msg, self.model_engine)
54+
55+
def summarize(self, chunks):
56+
text = '\n'.join(chunks)[:self.text_length_limit]
57+
msgs = [{
58+
"role": "system", "content": self.system_message
59+
}, {
60+
"role": "user", "content": self.message_format.format(text)
61+
}]
62+
return self.send_msg(msgs)

src/service/youtube.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import math
2+
import os
3+
import re
4+
from src.utils import get_role_and_content
5+
6+
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
7+
8+
9+
YOUTUBE_SYSTEM_MESSAGE = "你現在非常擅於做資料的整理、總結、歸納、統整,並能專注於細節、且能提出觀點"
10+
PART_MESSAGE_FORMAT = """ PART {} START
11+
下面是一個 Youtube 影片的部分字幕: \"\"\"{}\"\"\" \n\n請總結出這部影片的重點與一些細節,字數約 100 字左右
12+
PART {} END
13+
"""
14+
WHOLE_MESSAGE_FORMAT = "下面是每一個部分的小結論:\"\"\"{}\"\"\" \n\n 請給我全部小結論的總結,字數約 100 字左右"
15+
SINGLE_MESSAGE_FORMAT = "下面是一個 Youtube 影片的字幕: \"\"\"{}\"\"\" \n\n請總結出這部影片的重點與一些細節,字數約 100 字左右"
16+
17+
18+
class Youtube:
19+
def __init__(self, step):
20+
self.step = step
21+
self.chunk_size = 150
22+
23+
def get_transcript_chunks(self, video_id):
24+
try:
25+
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW', 'zh', 'ja', 'zh-Hant', 'zh-Hans', 'en', 'ko'])
26+
text = [t.get('text') for i, t in enumerate(transcript) if i % self.step == 0]
27+
chunks = ['\n'.join(text[i*self.chunk_size: (i+1)*self.chunk_size]) for i in range(math.ceil(len(text) / self.chunk_size))]
28+
except NoTranscriptFound:
29+
return False, [], '目前只支援:中文、英文、日文、韓文'
30+
except TranscriptsDisabled:
31+
return False, [], '本影片無開啟字幕功能'
32+
except Exception as e:
33+
return False, [], str(e)
34+
return True, chunks, None
35+
36+
def retrieve_video_id(self, url):
37+
regex = r'(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})'
38+
match = re.search(regex, url)
39+
if match:
40+
return match.group(1)
41+
else:
42+
return None
43+
44+
45+
class YoutubeTranscriptReader:
46+
def __init__(self, model=None, model_engine=None):
47+
self.summary_system_prompt = os.getenv('YOUTUBE_SYSTEM_MESSAGE') or YOUTUBE_SYSTEM_MESSAGE
48+
self.part_message_format = os.getenv('PART_MESSAGE_FORMAT') or PART_MESSAGE_FORMAT
49+
self.whole_message_format = os.getenv('WHOLE_MESSAGE_FORMAT') or WHOLE_MESSAGE_FORMAT
50+
self.single_message_format = os.getenv('SINGLE_MESSAGE_FORMAT') or SINGLE_MESSAGE_FORMAT
51+
self.model = model
52+
self.model_engine = model_engine
53+
54+
def send_msg(self, msg):
55+
return self.model.chat_completions(msg, self.model_engine)
56+
57+
def summarize(self, chunks):
58+
summary_msg = []
59+
if len(chunks) > 1:
60+
for i, chunk in enumerate(chunks):
61+
msgs = [{
62+
"role": "system", "content": self.summary_system_prompt
63+
}, {
64+
"role": "user", "content": self.part_message_format.format(i, chunk, i)
65+
}]
66+
_, response, _ = self.send_msg(msgs)
67+
_, content = get_role_and_content(response)
68+
summary_msg.append(content)
69+
text = '\n'.join(summary_msg)
70+
msgs = [{
71+
'role': 'system', 'content': self.summary_system_prompt
72+
}, {
73+
'role': 'user', 'content': self.whole_message_format.format(text)
74+
}]
75+
else:
76+
text = chunks[0]
77+
msgs = [{
78+
'role': 'system', 'content': self.summary_system_prompt
79+
}, {
80+
'role': 'user', 'content': self.single_message_format.format(text)
81+
}]
82+
return self.send_msg(msgs)

0 commit comments

Comments
 (0)