Skip to content

Commit f74825d

Browse files
committed
Add OpenAI-powered content parsing improvement feature
1 parent 3d48955 commit f74825d

File tree

6 files changed

+175
-2
lines changed

6 files changed

+175
-2
lines changed

backend/extractor/readability.go

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"github.com/PuerkitoBio/goquery"
1515
log "github.com/go-pkgz/lgr"
1616
"github.com/mauidude/go-readability"
17+
"github.com/sashabaranov/go-openai"
1718
"go.mongodb.org/mongo-driver/bson/primitive"
1819

1920
"github.com/ukeeper/ukeeper-redabilty/backend/datastore"
@@ -33,6 +34,7 @@ type UReadability struct {
3334
TimeOut time.Duration
3435
SnippetSize int
3536
Rules Rules
37+
OpenAIKey string
3638
}
3739

3840
// Response from api calls
@@ -128,6 +130,113 @@ func (f UReadability) Extract(ctx context.Context, reqURL string) (rb *Response,
128130
return rb, nil
129131
}
130132

133+
// ContentParsedWrong handles the logic for when content is parsed incorrectly
134+
func (f UReadability) ContentParsedWrong(ctx context.Context, urlStr string) (string, error) {
135+
// Extract content using the current method
136+
originalContent, err := f.Extract(ctx, urlStr)
137+
if err != nil {
138+
return "", fmt.Errorf("failed to extract content: %v", err)
139+
}
140+
141+
// Get CSS selector from ChatGPT
142+
selector, err := f.getChatGPTSelector(ctx, urlStr)
143+
if err != nil {
144+
return "", fmt.Errorf("failed to get CSS selector: %v", err)
145+
}
146+
147+
// Get the HTML body
148+
body, err := f.getHTMLBody(urlStr)
149+
if err != nil {
150+
return "", fmt.Errorf("failed to get HTML body: %v", err)
151+
}
152+
153+
// Extract content using the new selector
154+
newContent, err := f.extractContentWithSelector(body, selector)
155+
if err != nil {
156+
return "", fmt.Errorf("failed to extract content with new selector: %v", err)
157+
}
158+
159+
// Compare original and new content
160+
if strings.TrimSpace(originalContent.Content) != strings.TrimSpace(newContent) {
161+
// Contents are different, create a new rule
162+
rule := datastore.Rule{
163+
Author: "",
164+
Domain: f.extractDomain(urlStr),
165+
Content: selector,
166+
TestURLs: []string{urlStr},
167+
Enabled: true,
168+
}
169+
170+
_, err = f.Rules.Save(ctx, rule)
171+
if err != nil {
172+
return "", fmt.Errorf("failed to save new rule: %v", err)
173+
}
174+
175+
return fmt.Sprintf("new custom rule with DOM %s created", selector), nil
176+
}
177+
178+
return "default rule is good, no need to create the custom one", nil
179+
}
180+
181+
func (f UReadability) getChatGPTSelector(ctx context.Context, urlStr string) (string, error) {
182+
client := openai.NewClient(f.OpenAIKey)
183+
resp, err := client.CreateChatCompletion(
184+
ctx,
185+
openai.ChatCompletionRequest{
186+
Model: openai.GPT4o,
187+
Messages: []openai.ChatCompletionMessage{
188+
{
189+
Role: openai.ChatMessageRoleSystem,
190+
Content: "You are a helpful assistant that provides CSS selectors for extracting main content from web pages.",
191+
},
192+
{
193+
Role: openai.ChatMessageRoleUser,
194+
Content: fmt.Sprintf("Given the URL %s, identify the CSS selector that can be used to extract the main content of the article. This typically includes elements like 'article', 'main', or specific classes. Return only this selector and nothing else.", urlStr),
195+
},
196+
},
197+
},
198+
)
199+
200+
if err != nil {
201+
return "", err
202+
}
203+
204+
return resp.Choices[0].Message.Content, nil
205+
}
206+
207+
func (f UReadability) getHTMLBody(urlStr string) (string, error) {
208+
resp, err := http.Get(urlStr)
209+
if err != nil {
210+
return "", err
211+
}
212+
defer resp.Body.Close()
213+
214+
body, err := io.ReadAll(resp.Body)
215+
if err != nil {
216+
return "", err
217+
}
218+
219+
return string(body), nil
220+
}
221+
222+
func (f UReadability) extractContentWithSelector(body, selector string) (string, error) {
223+
doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
224+
if err != nil {
225+
return "", err
226+
}
227+
228+
content := doc.Find(selector).Text()
229+
return content, nil
230+
}
231+
232+
func (f UReadability) extractDomain(urlStr string) string {
233+
u, err := url.Parse(urlStr)
234+
if err != nil {
235+
return ""
236+
}
237+
return u.Hostname()
238+
}
239+
131240
// gets content from raw body string, both content (text only) and rich (with html tags)
132241
func (f UReadability) getContent(ctx context.Context, body, reqURL string) (content, rich string, err error) {
133242
// general parser

backend/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ func main() {
4242
log.Fatalf("[ERROR] can't connect to mongo %v", err)
4343
}
4444
srv := rest.Server{
45-
Readability: extractor.UReadability{TimeOut: 30, SnippetSize: 300, Rules: db.GetStores()},
45+
Readability: extractor.UReadability{TimeOut: 30, SnippetSize: 300, Rules: db.GetStores(), OpenAIKey: opts.OpenAIKey},
4646
Token: opts.Token,
4747
Credentials: opts.Credentials,
4848
Version: revision,

backend/rest/server.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ func (s *Server) routes(frontendDir string) chi.Router {
8383
r.Get("/rule/{id}", s.getRuleByID)
8484
r.Get("/rules", s.getAllRules)
8585
r.Post("/auth", s.authFake)
86+
r.Get("/content-parsed-wrong", s.contentParsedWrong)
8687

8788
r.Group(func(protected chi.Router) {
8889
protected.Use(basicAuth("ureadability", s.Credentials))
@@ -288,6 +289,24 @@ func (s *Server) generateSummary(ctx context.Context, content string) (string, e
288289
return resp.Choices[0].Message.Content, nil
289290
}
290291

292+
func (s *Server) contentParsedWrong(w http.ResponseWriter, r *http.Request) {
293+
exampleURL := r.URL.Query().Get("url")
294+
if exampleURL == "" {
295+
render.Status(r, http.StatusBadRequest)
296+
render.JSON(w, r, JSON{"error": "url parameter is required"})
297+
return
298+
}
299+
300+
message, err := s.Readability.ContentParsedWrong(r.Context(), exampleURL)
301+
if err != nil {
302+
render.Status(r, http.StatusBadRequest)
303+
render.JSON(w, r, JSON{"error": err.Error()})
304+
return
305+
}
306+
307+
render.JSON(w, r, JSON{"message": message})
308+
}
309+
291310
func getBid(id string) primitive.ObjectID {
292311
bid, err := primitive.ObjectIDFromHex(id)
293312
if err != nil {

backend/web/index.html

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,12 @@
4343
</table>
4444
</div>
4545
</div>
46-
46+
<div class="content-parsed-wrong">
47+
<h2>Report Incorrectly Parsed Content</h2>
48+
<input type="text" id="url-input" placeholder="Enter URL">
49+
<button id="report-button">Report Incorrect Parsing</button>
50+
<div id="result-message"></div>
51+
</div>
4752
<div class="footer wrapper page__footer">
4853
<a href="#" class="link footer__copyright">uReadability</a>,
4954
<script>document.write((new Date()).getFullYear().toString());</script>

backend/web/main.css

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,4 +324,26 @@ img {
324324

325325
textarea {
326326
overflow: auto;
327+
}
328+
329+
.content-parsed-wrong {
330+
margin-top: 20px;
331+
padding: 20px;
332+
border: 1px solid #ccc;
333+
border-radius: 5px;
334+
}
335+
336+
#url-input {
337+
width: 300px;
338+
padding: 5px;
339+
margin-right: 10px;
340+
}
341+
342+
#report-button {
343+
padding: 5px 10px;
344+
}
345+
346+
#result-message {
347+
margin-top: 10px;
348+
font-weight: bold;
327349
}

backend/web/main.js

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,24 @@ document.addEventListener('DOMContentLoaded', () => {
232232
if (document.getElementById('rules__list')) {
233233
loadRules();
234234
}
235+
236+
// Wire up the "Report Incorrect Parsing" form. The button is looked up first so that a page
// without this form does not throw on a null element.
// NOTE(review): assumes main.js may load on pages lacking the form — confirm.
const reportButton = document.getElementById('report-button');
if (reportButton) {
  reportButton.addEventListener('click', function () {
    const resultMessage = document.getElementById('result-message');
    const url = document.getElementById('url-input').value.trim();
    if (!url) {
      alert('Please enter a URL');
      return;
    }

    fetch(`/api/content-parsed-wrong?url=${encodeURIComponent(url)}`)
      .then(response => response.json())
      .then(data => {
        // The API replies {message: ...} on success and {error: ...} on failure;
        // show whichever is present instead of silently dropping the error text.
        resultMessage.textContent =
          data.message || data.error || 'An error occurred. Please try again.';
      })
      .catch(error => {
        console.error('Error:', error);
        resultMessage.textContent = 'An error occurred. Please try again.';
      });
  });
}
235253
});
236254

237255
function loadRules() {

0 commit comments

Comments
 (0)