@@ -394,6 +394,114 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
394
394
return rb , nil
395
395
}
396
396
397
+ // ContentParsedWrong handles the logic for when content is parsed incorrectly
398
+ func (f * UReadability ) ContentParsedWrong (ctx context.Context , urlStr string ) (string , error ) {
399
+ // Extract content using the current method
400
+ originalContent , err := f .Extract (ctx , urlStr )
401
+ if err != nil {
402
+ return "" , fmt .Errorf ("failed to extract content: %v" , err )
403
+ }
404
+
405
+ // Get CSS selector from ChatGPT
406
+ selector , err := f .getChatGPTSelector (ctx , urlStr )
407
+ if err != nil {
408
+ return "" , fmt .Errorf ("failed to get CSS selector: %v" , err )
409
+ }
410
+
411
+ // Get the HTML body
412
+ body , err := f .getHTMLBody (urlStr )
413
+ if err != nil {
414
+ return "" , fmt .Errorf ("failed to get HTML body: %v" , err )
415
+ }
416
+
417
+ // Extract content using the new selector
418
+ newContent , err := f .extractContentWithSelector (body , selector )
419
+ if err != nil {
420
+ return "" , fmt .Errorf ("failed to extract content with new selector: %v" , err )
421
+ }
422
+
423
+ // Compare original and new content
424
+ if strings .TrimSpace (originalContent .Content ) != strings .TrimSpace (newContent ) {
425
+ // Contents are different, create a new rule
426
+ rule := datastore.Rule {
427
+ Author : "" ,
428
+ Domain : f .extractDomain (urlStr ),
429
+ Content : selector ,
430
+ TestURLs : []string {urlStr },
431
+ Enabled : true ,
432
+ }
433
+
434
+ _ , err = f .Rules .Save (ctx , rule )
435
+ if err != nil {
436
+ return "" , fmt .Errorf ("failed to save new rule: %v" , err )
437
+ }
438
+
439
+ return fmt .Sprintf ("new custom rule with DOM %s created" , selector ), nil
440
+ }
441
+
442
+ return "default rule is good, no need to create the custom one" , nil
443
+ }
444
+
445
+ func (f * UReadability ) getChatGPTSelector (ctx context.Context , urlStr string ) (string , error ) {
446
+ client := openai .NewClient (f .OpenAIKey )
447
+ resp , err := client .CreateChatCompletion (
448
+ ctx ,
449
+ openai.ChatCompletionRequest {
450
+ Model : openai .GPT4o ,
451
+ Messages : []openai.ChatCompletionMessage {
452
+ {
453
+ Role : openai .ChatMessageRoleSystem ,
454
+ Content : "You are a helpful assistant that provides CSS selectors for extracting main content from web pages." ,
455
+ },
456
+ {
457
+ Role : openai .ChatMessageRoleUser ,
458
+ Content : fmt .Sprintf ("Given the URL %s, identify the CSS selector that can be used to extract the main content of the article. This typically includes elements like 'article', 'main', or specific classes. Return only this selector and nothing else." , urlStr ),
459
+ },
460
+ },
461
+ },
462
+ )
463
+
464
+ if err != nil {
465
+ return "" , err
466
+ }
467
+
468
+ return resp .Choices [0 ].Message .Content , nil
469
+ }
470
+
471
+ func (f * UReadability ) getHTMLBody (urlStr string ) (string , error ) {
472
+ //nolint:gosec
473
+ resp , err := http .Get (urlStr )
474
+ if err != nil {
475
+ return "" , err
476
+ }
477
+ defer resp .Body .Close ()
478
+
479
+ body , err := io .ReadAll (resp .Body )
480
+ if err != nil {
481
+ return "" , err
482
+ }
483
+
484
+ return string (body ), nil
485
+ }
486
+
487
+ func (f * UReadability ) extractContentWithSelector (body , selector string ) (string , error ) {
488
+ doc , err := goquery .NewDocumentFromReader (strings .NewReader (body ))
489
+ if err != nil {
490
+ return "" , err
491
+ }
492
+
493
+ content := doc .Find (selector ).Text ()
494
+ return content , nil
495
+ }
496
+
497
+ func (f * UReadability ) extractDomain (urlStr string ) string {
498
+ u , err := url .Parse (urlStr )
499
+ if err != nil {
500
+ return ""
501
+ }
502
+ return u .Hostname ()
503
+ }
504
+
397
505
// getContent retrieves content from raw body string, both content (text only) and rich (with html tags)
398
506
// if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage,
399
507
// and at last tries to use general readability parser
0 commit comments