@@ -14,6 +14,7 @@ import (
1414 "github.com/PuerkitoBio/goquery"
1515 log "github.com/go-pkgz/lgr"
1616 "github.com/mauidude/go-readability"
17+ "github.com/sashabaranov/go-openai"
1718 "go.mongodb.org/mongo-driver/bson/primitive"
1819
1920 "github.com/ukeeper/ukeeper-redabilty/backend/datastore"
@@ -33,6 +34,7 @@ type UReadability struct {
3334 TimeOut time.Duration
3435 SnippetSize int
3536 Rules Rules
37+ OpenAIKey string
3638}
3739
3840// Response from api calls
@@ -128,6 +130,113 @@ func (f UReadability) Extract(ctx context.Context, reqURL string) (rb *Response,
128130 return rb , nil
129131}
130132
133+ // ContentParsedWrong handles the logic for when content is parsed incorrectly
134+ func (f UReadability ) ContentParsedWrong (ctx context.Context , urlStr string ) (string , error ) {
135+ // Extract content using the current method
136+ originalContent , err := f .Extract (ctx , urlStr )
137+ if err != nil {
138+ return "" , fmt .Errorf ("failed to extract content: %v" , err )
139+ }
140+
141+ // Get CSS selector from ChatGPT
142+ selector , err := f .getChatGPTSelector (ctx , urlStr )
143+ if err != nil {
144+ return "" , fmt .Errorf ("failed to get CSS selector: %v" , err )
145+ }
146+
147+ // Get the HTML body
148+ body , err := f .getHTMLBody (urlStr )
149+ if err != nil {
150+ return "" , fmt .Errorf ("failed to get HTML body: %v" , err )
151+ }
152+
153+ // Extract content using the new selector
154+ newContent , err := f .extractContentWithSelector (body , selector )
155+ if err != nil {
156+ return "" , fmt .Errorf ("failed to extract content with new selector: %v" , err )
157+ }
158+
159+ // Compare original and new content
160+ if strings .TrimSpace (originalContent .Content ) != strings .TrimSpace (newContent ) {
161+ // Contents are different, create a new rule
162+ rule := datastore.Rule {
163+ Author : "" ,
164+ Domain : f .extractDomain (urlStr ),
165+ Content : selector ,
166+ TestURLs : []string {urlStr },
167+ Enabled : true ,
168+ }
169+
170+ _ , err = f .Rules .Save (ctx , rule )
171+ if err != nil {
172+ return "" , fmt .Errorf ("failed to save new rule: %v" , err )
173+ }
174+
175+ return fmt .Sprintf ("new custom rule with DOM %s created" , selector ), nil
176+ }
177+
178+ return "default rule is good, no need to create the custom one" , nil
179+ }
180+
181+ func (f UReadability ) getChatGPTSelector (ctx context.Context , urlStr string ) (string , error ) {
182+ client := openai .NewClient (f .OpenAIKey )
183+ resp , err := client .CreateChatCompletion (
184+ ctx ,
185+ openai.ChatCompletionRequest {
186+ Model : openai .GPT4o ,
187+ Messages : []openai.ChatCompletionMessage {
188+ {
189+ Role : openai .ChatMessageRoleSystem ,
190+ Content : "You are a helpful assistant that provides CSS selectors for extracting main content from web pages." ,
191+ },
192+ {
193+ Role : openai .ChatMessageRoleUser ,
194+ Content : fmt .Sprintf ("Given the URL %s, identify the CSS selector that can be used to extract the main content of the article. This typically includes elements like 'article', 'main', or specific classes. Return only this selector and nothing else." , urlStr ),
195+ },
196+ },
197+ },
198+ )
199+
200+ if err != nil {
201+ return "" , err
202+ }
203+
204+ return resp .Choices [0 ].Message .Content , nil
205+ }
206+
207+ func (f UReadability ) getHTMLBody (urlStr string ) (string , error ) {
208+ resp , err := http .Get (urlStr )
209+ if err != nil {
210+ return "" , err
211+ }
212+ defer resp .Body .Close ()
213+
214+ body , err := io .ReadAll (resp .Body )
215+ if err != nil {
216+ return "" , err
217+ }
218+
219+ return string (body ), nil
220+ }
221+
222+ func (f UReadability ) extractContentWithSelector (body , selector string ) (string , error ) {
223+ doc , err := goquery .NewDocumentFromReader (strings .NewReader (body ))
224+ if err != nil {
225+ return "" , err
226+ }
227+
228+ content := doc .Find (selector ).Text ()
229+ return content , nil
230+ }
231+
232+ func (f UReadability ) extractDomain (urlStr string ) string {
233+ u , err := url .Parse (urlStr )
234+ if err != nil {
235+ return ""
236+ }
237+ return u .Hostname ()
238+ }
239+
131240// gets content from raw body string, both content (text only) and rich (with html tags)
132241func (f UReadability ) getContent (ctx context.Context , body , reqURL string ) (content , rich string , err error ) {
133242 // general parser
0 commit comments