Skip to content

Commit 9e9a926

Browse files
committed
Add OpenAI-powered content parsing improvement feature
1 parent 8db4950 commit 9e9a926

File tree

2 files changed

+139
-0
lines changed

2 files changed

+139
-0
lines changed

backend/extractor/readability.go

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,114 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
394394
return rb, nil
395395
}
396396

397+
// ContentParsedWrong handles the logic for when content is parsed incorrectly
398+
func (f *UReadability) ContentParsedWrong(ctx context.Context, urlStr string) (string, error) {
399+
// Extract content using the current method
400+
originalContent, err := f.Extract(ctx, urlStr)
401+
if err != nil {
402+
return "", fmt.Errorf("failed to extract content: %v", err)
403+
}
404+
405+
// Get CSS selector from ChatGPT
406+
selector, err := f.getChatGPTSelector(ctx, urlStr)
407+
if err != nil {
408+
return "", fmt.Errorf("failed to get CSS selector: %v", err)
409+
}
410+
411+
// Get the HTML body
412+
body, err := f.getHTMLBody(urlStr)
413+
if err != nil {
414+
return "", fmt.Errorf("failed to get HTML body: %v", err)
415+
}
416+
417+
// Extract content using the new selector
418+
newContent, err := f.extractContentWithSelector(body, selector)
419+
if err != nil {
420+
return "", fmt.Errorf("failed to extract content with new selector: %v", err)
421+
}
422+
423+
// Compare original and new content
424+
if strings.TrimSpace(originalContent.Content) != strings.TrimSpace(newContent) {
425+
// Contents are different, create a new rule
426+
rule := datastore.Rule{
427+
Author: "",
428+
Domain: f.extractDomain(urlStr),
429+
Content: selector,
430+
TestURLs: []string{urlStr},
431+
Enabled: true,
432+
}
433+
434+
_, err = f.Rules.Save(ctx, rule)
435+
if err != nil {
436+
return "", fmt.Errorf("failed to save new rule: %v", err)
437+
}
438+
439+
return fmt.Sprintf("new custom rule with DOM %s created", selector), nil
440+
}
441+
442+
return "default rule is good, no need to create the custom one", nil
443+
}
444+
445+
func (f *UReadability) getChatGPTSelector(ctx context.Context, urlStr string) (string, error) {
446+
client := openai.NewClient(f.OpenAIKey)
447+
resp, err := client.CreateChatCompletion(
448+
ctx,
449+
openai.ChatCompletionRequest{
450+
Model: openai.GPT4o,
451+
Messages: []openai.ChatCompletionMessage{
452+
{
453+
Role: openai.ChatMessageRoleSystem,
454+
Content: "You are a helpful assistant that provides CSS selectors for extracting main content from web pages.",
455+
},
456+
{
457+
Role: openai.ChatMessageRoleUser,
458+
Content: fmt.Sprintf("Given the URL %s, identify the CSS selector that can be used to extract the main content of the article. This typically includes elements like 'article', 'main', or specific classes. Return only this selector and nothing else.", urlStr),
459+
},
460+
},
461+
},
462+
)
463+
464+
if err != nil {
465+
return "", err
466+
}
467+
468+
return resp.Choices[0].Message.Content, nil
469+
}
470+
471+
func (f *UReadability) getHTMLBody(urlStr string) (string, error) {
472+
//nolint:gosec
473+
resp, err := http.Get(urlStr)
474+
if err != nil {
475+
return "", err
476+
}
477+
defer resp.Body.Close()
478+
479+
body, err := io.ReadAll(resp.Body)
480+
if err != nil {
481+
return "", err
482+
}
483+
484+
return string(body), nil
485+
}
486+
487+
func (f *UReadability) extractContentWithSelector(body, selector string) (string, error) {
488+
doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
489+
if err != nil {
490+
return "", err
491+
}
492+
493+
content := doc.Find(selector).Text()
494+
return content, nil
495+
}
496+
497+
func (f *UReadability) extractDomain(urlStr string) string {
498+
u, err := url.Parse(urlStr)
499+
if err != nil {
500+
return ""
501+
}
502+
return u.Hostname()
503+
}
504+
397505
// getContent retrieves content from raw body string, both content (text only) and rich (with html tags)
398506
// if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage,
399507
// and at last tries to use general readability parser

backend/rest/server.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ func (s *Server) routes(frontendDir string) http.Handler {
8080
api.HandleFunc("POST /extract", s.extractArticle)
8181
api.HandleFunc("POST /auth", s.authFake)
8282
api.HandleFunc("GET /metrics", s.handleMetrics)
83+
api.HandleFunc("GET /content-parsed-wrong", s.contentParsedWrong)
8384

8485
// add protected group with its own set of middlewares
8586
protectedGroup := api.Group()
@@ -224,6 +225,15 @@ func (s *Server) extractArticleEmulateReadability(w http.ResponseWriter, r *http
224225
res.Summary = summaryText
225226
}
226227

228+
if summary {
229+
summaryText, err := s.Readability.GenerateSummary(r.Context(), res.Content)
230+
if err != nil {
231+
rest.SendErrorJSON(w, r, log.Default(), http.StatusInternalServerError, err, fmt.Sprintf("failed to generate summary: %v", err))
232+
return
233+
}
234+
res.Summary = summaryText
235+
}
236+
227237
rest.RenderJSON(w, &res)
228238
}
229239

@@ -409,6 +419,27 @@ func (s *Server) handleMetrics(w http.ResponseWriter, _ *http.Request) {
409419
})
410420
}
411421

422+
func (s *Server) contentParsedWrong(w http.ResponseWriter, r *http.Request) {
423+
if s.Readability.OpenAIKey == "" {
424+
rest.SendErrorJSON(w, r, log.Default(), http.StatusBadRequest, nil, "OpenAI key is not set")
425+
return
426+
}
427+
428+
exampleURL := r.URL.Query().Get("url")
429+
if exampleURL == "" {
430+
rest.SendErrorJSON(w, r, log.Default(), http.StatusBadRequest, nil, "url parameter is required")
431+
return
432+
}
433+
434+
message, err := s.Readability.ContentParsedWrong(r.Context(), exampleURL)
435+
if err != nil {
436+
rest.SendErrorJSON(w, r, log.Default(), http.StatusInternalServerError, err, err.Error())
437+
return
438+
}
439+
440+
rest.RenderJSON(w, JSON{"message": message})
441+
}
442+
412443
func getBid(id string) primitive.ObjectID {
413444
bid, err := primitive.ObjectIDFromHex(id)
414445
if err != nil {

0 commit comments

Comments
 (0)