Skip to content

Commit 7229311

Browse files
committed
Add article summary feature with OpenAI integration
- Introduce 'summary' query parameter in /api/content/v1/parser endpoint
- Integrate OpenAI API for generating article summaries
- Add OpenAIKey field to Server struct and corresponding command-line flag
- Update extractArticleEmulateReadability to handle summary requests
- Add GenerateSummary method using OpenAI's GPT-4o model (turns out to be faster than even 4o mini)
- Add OpenAIClient interface and mock for testing
- Update README.md with new configuration options and API details

This feature allows users to request a summary of extracted articles using OpenAI's GPT-4o model. To ensure secure usage, summary generation requires a valid server token. The changes include comprehensive error handling and test coverage for various scenarios, including token validation and server misconfiguration.
1 parent 9642f97 commit 7229311

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+6948
-12
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
| address | UKEEPER_ADDRESS | all interfaces | web server listening address |
1212
| port | UKEEPER_PORT | `8080` | web server port |
1313
| mongo_uri | MONGO_URI | none | MongoDB connection string, _required_ |
14+
| openai_key | OPENAI_KEY | none | OpenAI API key for summary generation |
1415
| frontend_dir | FRONTEND_DIR | `/srv/web` | directory with frontend files |
1516
| token | TOKEN | none | token for /content/v1/parser endpoint auth |
1617
| mongo-delay | MONGO_DELAY | `0` | mongo initial delay |
@@ -20,7 +21,7 @@
2021

2122
### API
2223

23-
GET /api/content/v1/parser?token=secret&url=http://aa.com/blah - extract content (emulate Readability API parse call)
24+
GET /api/content/v1/parser?token=secret&summary=true&url=http://aa.com/blah - extract content (emulate Readability API parse call); the `summary` parameter is optional and works only when both the OpenAI key and the server token are configured
2425
POST /api/v1/extract {url: http://aa.com/blah} - extract content
2526

2627
## Development

backend/extractor/openai_mock.go

Lines changed: 82 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/extractor/pics.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import (
1111
log "github.com/go-pkgz/lgr"
1212
)
1313

14-
func (f UReadability) extractPics(iselect *goquery.Selection, url string) (mainImage string, allImages []string, ok bool) {
14+
func (f *UReadability) extractPics(iselect *goquery.Selection, url string) (mainImage string, allImages []string, ok bool) {
1515
images := make(map[int]string)
1616

1717
type imgInfo struct {
@@ -58,7 +58,7 @@ func (f UReadability) extractPics(iselect *goquery.Selection, url string) (mainI
5858
}
5959

6060
// getImageSize loads image to get size
61-
func (f UReadability) getImageSize(url string) (size int) {
61+
func (f *UReadability) getImageSize(url string) (size int) {
6262
httpClient := &http.Client{Timeout: time.Second * 30}
6363
req, err := http.NewRequest("GET", url, nil)
6464
if err != nil {

backend/extractor/readability.go

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,17 @@ import (
1414
"github.com/PuerkitoBio/goquery"
1515
log "github.com/go-pkgz/lgr"
1616
"github.com/mauidude/go-readability"
17+
"github.com/sashabaranov/go-openai"
1718
"go.mongodb.org/mongo-driver/bson/primitive"
1819

1920
"github.com/ukeeper/ukeeper-redabilty/backend/datastore"
2021
)
2122

23+
//go:generate moq -out openai_mock.go . OpenAIClient
24+
// OpenAIClient is the single OpenAI call UReadability needs for summaries.
// Declaring it as an interface lets tests swap in the moq-generated OpenAIClientMock.
type OpenAIClient interface {
25+
// CreateChatCompletion is invoked by GenerateSummary with the model and messages;
// the summary text is read from the choices of the returned response.
CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error)
26+
}
27+
2228
// Rules interface with all methods to access datastore
2329
type Rules interface {
2430
Get(ctx context.Context, rURL string) (datastore.Rule, bool)
@@ -33,10 +39,14 @@ type UReadability struct {
3339
TimeOut time.Duration
3440
SnippetSize int
3541
Rules Rules
42+
OpenAIKey string
43+
44+
openAIClient OpenAIClient
3645
}
3746

3847
// Response from api calls
3948
type Response struct {
49+
Summary string `json:"summary,omitempty"`
4050
Content string `json:"content"`
4151
Rich string `json:"rich_content"`
4252
Domain string `json:"domain"`
@@ -59,17 +69,48 @@ var (
5969
const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15"
6070

6171
// Extract fetches page and retrieves article
62-
func (f UReadability) Extract(ctx context.Context, reqURL string) (*Response, error) {
72+
func (f *UReadability) Extract(ctx context.Context, reqURL string) (*Response, error) {
6373
// nil rule: no caller-supplied rule is passed down to extractWithRules
return f.extractWithRules(ctx, reqURL, nil)
6474
}
6575

6676
// ExtractByRule fetches page and retrieves article using a specific rule
67-
func (f UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
77+
func (f *UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
6878
// same path as Extract, but forwards the caller's rule instead of nil
return f.extractWithRules(ctx, reqURL, rule)
6979
}
7080

81+
func (f *UReadability) GenerateSummary(ctx context.Context, content string) (string, error) {
82+
if f.OpenAIKey == "" {
83+
return "", fmt.Errorf("OpenAI key is not set")
84+
}
85+
if f.openAIClient == nil {
86+
f.openAIClient = openai.NewClient(f.OpenAIKey)
87+
}
88+
resp, err := f.openAIClient.CreateChatCompletion(
89+
ctx,
90+
openai.ChatCompletionRequest{
91+
Model: openai.GPT4o,
92+
Messages: []openai.ChatCompletionMessage{
93+
{
94+
Role: openai.ChatMessageRoleSystem,
95+
Content: "You are a helpful assistant that summarizes articles. Please summarize the main points in a few sentences as TLDR style (don't add a TLDR label). Then, list up to five detailed bullet points. Provide the response in plain text. Do not add any additional information. Do not add a Summary at the beginning of the response. If detailed bullet points are too similar to the summary, don't include them at all:",
96+
},
97+
{
98+
Role: openai.ChatMessageRoleUser,
99+
Content: content,
100+
},
101+
},
102+
},
103+
)
104+
105+
if err != nil {
106+
return "", err
107+
}
108+
109+
return resp.Choices[0].Message.Content, nil
110+
}
111+
71112
// extractWithRules is the core function that handles extraction with or without a specific rule
72-
func (f UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
113+
func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
73114
log.Printf("[INFO] extract %s", reqURL)
74115
rb := &Response{}
75116

@@ -140,7 +181,7 @@ func (f UReadability) extractWithRules(ctx context.Context, reqURL string, rule
140181
// getContent retrieves content from raw body string, both content (text only) and rich (with html tags)
141182
// if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage,
142183
// and at last tries to use general readability parser
143-
func (f UReadability) getContent(ctx context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) {
184+
func (f *UReadability) getContent(ctx context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) {
144185
// general parser
145186
genParser := func(body, _ string) (content, rich string, err error) {
146187
doc, err := readability.NewDocument(body)
@@ -192,7 +233,7 @@ func (f UReadability) getContent(ctx context.Context, body, reqURL string, rule
192233
}
193234

194235
// makes all links absolute and returns all found links
195-
func (f UReadability) normalizeLinks(data string, reqContext *http.Request) (result string, links []string) {
236+
func (f *UReadability) normalizeLinks(data string, reqContext *http.Request) (result string, links []string) {
196237
absoluteLink := func(link string) (absLink string, changed bool) {
197238
if r, err := reqContext.URL.Parse(link); err == nil {
198239
return r.String(), r.String() != link

backend/extractor/readability_test.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ import (
1111
"testing"
1212
"time"
1313

14+
"github.com/sashabaranov/go-openai"
1415
"github.com/stretchr/testify/assert"
16+
"github.com/stretchr/testify/require"
1517
"go.mongodb.org/mongo-driver/bson/primitive"
1618

1719
"github.com/ukeeper/ukeeper-redabilty/backend/datastore"
@@ -176,3 +178,61 @@ func TestGetContentCustom(t *testing.T) {
176178
assert.Equal(t, 6988, len(content))
177179
assert.Equal(t, 7169, len(rich))
178180
}
181+
182+
// TestUReadability_GenerateSummary runs GenerateSummary against a mocked OpenAI client:
// with a key set it must return the mock's message content, and with an empty key
// it must fail with an error containing "OpenAI key is not set".
func TestUReadability_GenerateSummary(t *testing.T) {
183+
// mock client that always answers with one fixed choice and no error
mockOpenAI := &OpenAIClientMock{
184+
CreateChatCompletionFunc: func(ctx context.Context, request openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error) {
185+
return openai.ChatCompletionResponse{
186+
Choices: []openai.ChatCompletionChoice{
187+
{
188+
Message: openai.ChatCompletionMessage{
189+
Content: "This is a summary of the article.",
190+
},
191+
},
192+
},
193+
}, nil
194+
},
195+
}
196+
197+
// table of cases; an empty expectedError means the call is expected to succeed
tests := []struct {
198+
name string
199+
content string
200+
openAIKey string
201+
expectedResult string
202+
expectedError string
203+
}{
204+
{
205+
name: "Valid OpenAI Key and content",
206+
content: "This is a test article content.",
207+
openAIKey: "test-key",
208+
expectedResult: "This is a summary of the article.",
209+
expectedError: "",
210+
},
211+
{
212+
name: "No OpenAI Key",
213+
content: "This is a test article content.",
214+
openAIKey: "",
215+
expectedResult: "",
216+
expectedError: "OpenAI key is not set",
217+
},
218+
}
219+
220+
for _, tt := range tests {
221+
t.Run(tt.name, func(t *testing.T) {
222+
// the mock is injected through the unexported field, so no real client is created
readability := UReadability{
223+
OpenAIKey: tt.openAIKey,
224+
openAIClient: mockOpenAI,
225+
}
226+
227+
result, err := readability.GenerateSummary(context.Background(), tt.content)
228+
229+
if tt.expectedError != "" {
230+
require.Error(t, err)
231+
assert.Contains(t, err.Error(), tt.expectedError)
232+
} else {
233+
require.NoError(t, err)
234+
assert.Equal(t, tt.expectedResult, result)
235+
}
236+
})
237+
}
238+
}

backend/extractor/text.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import (
1212
)
1313

1414
// get clean text from html content
15-
func (f UReadability) getText(content, title string) string {
15+
func (f *UReadability) getText(content, title string) string {
1616
cleanText := sanitize.HTML(content)
1717
cleanText = strings.Replace(cleanText, title, "", 1) // get rid of title in snippet
1818
cleanText = strings.ReplaceAll(cleanText, "\t", " ")
@@ -32,7 +32,7 @@ func (f UReadability) getText(content, title string) string {
3232
}
3333

3434
// get snippet from clean text content
35-
func (f UReadability) getSnippet(cleanText string) string {
35+
func (f *UReadability) getSnippet(cleanText string) string {
3636
cleanText = strings.ReplaceAll(cleanText, "\n", " ")
3737
size := len([]rune(cleanText))
3838
if size > f.SnippetSize {
@@ -50,7 +50,7 @@ func (f UReadability) getSnippet(cleanText string) string {
5050
}
5151

5252
// detect encoding, content type and convert content to utf8
53-
func (f UReadability) toUtf8(content []byte, header http.Header) (contentType, origEncoding, result string) {
53+
func (f *UReadability) toUtf8(content []byte, header http.Header) (contentType, origEncoding, result string) {
5454
getContentTypeAndEncoding := func(str string) (contentType, encoding string) { // from "text/html; charset=windows-1251"
5555
elems := strings.Split(str, ";")
5656
contentType = strings.TrimSpace(elems[0])

backend/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ require (
1515
github.com/jessevdk/go-flags v1.6.1
1616
github.com/kennygrant/sanitize v1.2.4
1717
github.com/mauidude/go-readability v0.0.0-20220221173116-a9b3620098b7
18+
github.com/sashabaranov/go-openai v1.32.0
1819
github.com/stretchr/testify v1.9.0
1920
go.mongodb.org/mongo-driver v1.16.1
2021
golang.org/x/net v0.28.0

backend/go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,10 @@ github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6So
183183
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
184184
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
185185
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
186+
github.com/sashabaranov/go-openai v1.28.1 h1:aREx6faUTeOZNMDTNGAY8B9vNmmN7qoGvDV0Ke2J1Mc=
187+
github.com/sashabaranov/go-openai v1.28.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
188+
github.com/sashabaranov/go-openai v1.32.0 h1:Yk3iE9moX3RBXxrof3OBtUBrE7qZR0zF9ebsoO4zVzI=
189+
github.com/sashabaranov/go-openai v1.32.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
186190
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
187191
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
188192
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=

backend/main.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ var opts struct {
2626
MongoURI string `short:"m" long:"mongo_uri" env:"MONGO_URI" required:"true" description:"MongoDB connection string"`
2727
MongoDelay time.Duration `long:"mongo-delay" env:"MONGO_DELAY" default:"0" description:"mongo initial delay"`
2828
MongoDB string `long:"mongo-db" env:"MONGO_DB" default:"ureadability" description:"mongo database name"`
29+
OpenAIKey string `long:"openai_key" env:"OPENAI_KEY" description:"OpenAI API key for summary generation"`
2930
Debug bool `long:"dbg" env:"DEBUG" description:"debug mode"`
3031
}
3132

@@ -41,7 +42,7 @@ func main() {
4142
log.Fatalf("[ERROR] can't connect to mongo %v", err)
4243
}
4344
srv := rest.Server{
44-
Readability: extractor.UReadability{TimeOut: 30, SnippetSize: 300, Rules: db.GetStores()},
45+
Readability: extractor.UReadability{TimeOut: 30, SnippetSize: 300, Rules: db.GetStores(), OpenAIKey: opts.OpenAIKey},
4546
Token: opts.Token,
4647
Credentials: opts.Credentials,
4748
Version: revision,

0 commit comments

Comments
 (0)