Skip to content

Commit 7bdf21d

Browse files
committed
Add article summary feature with OpenAI integration
- Introduce 'summary' query parameter in /api/content/v1/parser endpoint
- Integrate OpenAI API for generating article summaries
- Add OpenAIKey field to Server struct and corresponding command-line flag
- Update extractArticleEmulateReadability to handle summary requests
- Add generateSummary method using OpenAI's GPT-4o model (turns out to be faster than even 4o mini)
- Add OpenAIClient interface and mock for testing
- Update README.md with new configuration options and API details

This feature allows users to request a summary of extracted articles using OpenAI's GPT-4o model. To ensure secure usage, summary generation requires a valid server token. The changes include comprehensive error handling and test coverage for various scenarios, including token validation and server misconfiguration.

# Conflicts:
# backend/go.mod
# backend/rest/server.go
1 parent 6acfb34 commit 7bdf21d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+6993
-1
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
| port | UKEEPER_PORT | `8080` | web server port |
1313
| mongo-uri | MONGO_URI | none | MongoDB connection string, _required_ |
1414
| frontend-dir | FRONTEND_DIR | `/srv/web` | directory with frontend files |
15+
| openai_key | OPENAI_KEY | none | OpenAI API key for summary generation |
1516
| token | TOKEN | none | token for /content/v1/parser endpoint auth |
1617
| mongo-delay | MONGO_DELAY | `0` | mongo initial delay |
1718
| mongo-db | MONGO_DB | `ureadability` | mongo database name |
@@ -20,7 +21,7 @@
2021

2122
### API
2223

23-
GET /api/content/v1/parser?token=secret&url=http://aa.com/blah - extract content (emulate Readability API parse call)
24+
GET /api/content/v1/parser?token=secret&summary=true&url=http://aa.com/blah - extract content (emulate Readability API parse call), summary is optional and requires OpenAI key and token to be enabled
2425
POST /api/v1/extract {url: http://aa.com/blah} - extract content
2526

2627
## Development

backend/extractor/openai_mock.go

Lines changed: 82 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/extractor/readability.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,17 @@ import (
1414
"github.com/PuerkitoBio/goquery"
1515
log "github.com/go-pkgz/lgr"
1616
"github.com/mauidude/go-readability"
17+
"github.com/sashabaranov/go-openai"
1718
"go.mongodb.org/mongo-driver/bson/primitive"
1819

1920
"github.com/ukeeper/ukeeper-readability/backend/datastore"
2021
)
2122

23+
//go:generate moq -out openai_mock.go . OpenAIClient
24+
// OpenAIClient is the single chat-completion call that summary generation needs
// from an OpenAI client. Keeping it behind an interface lets tests substitute
// the moq-generated mock for the real client.
type OpenAIClient interface {
25+
CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error)
26+
}
27+
2228
// Rules interface with all methods to access datastore
2329
type Rules interface {
2430
Get(ctx context.Context, rURL string) (datastore.Rule, bool)
@@ -33,10 +39,14 @@ type UReadability struct {
3339
TimeOut time.Duration
3440
SnippetSize int
3541
Rules Rules
42+
OpenAIKey string
43+
44+
openAIClient OpenAIClient
3645
}
3746

3847
// Response from api calls
3948
type Response struct {
49+
Summary string `json:"summary,omitempty"`
4050
Content string `json:"content"`
4151
Rich string `json:"rich_content"`
4252
Domain string `json:"domain"`
@@ -70,6 +80,37 @@ func (f *UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *d
7080
return f.extractWithRules(ctx, reqURL, rule)
7181
}
7282

83+
func (f *UReadability) GenerateSummary(ctx context.Context, content string) (string, error) {
84+
if f.OpenAIKey == "" {
85+
return "", fmt.Errorf("OpenAI key is not set")
86+
}
87+
if f.openAIClient == nil {
88+
f.openAIClient = openai.NewClient(f.OpenAIKey)
89+
}
90+
resp, err := f.openAIClient.CreateChatCompletion(
91+
ctx,
92+
openai.ChatCompletionRequest{
93+
Model: openai.GPT4o,
94+
Messages: []openai.ChatCompletionMessage{
95+
{
96+
Role: openai.ChatMessageRoleSystem,
97+
Content: "You are a helpful assistant that summarizes articles. Please summarize the main points in a few sentences as TLDR style (don't add a TLDR label). Then, list up to five detailed bullet points. Provide the response in plain text. Do not add any additional information. Do not add a Summary at the beginning of the response. If detailed bullet points are too similar to the summary, don't include them at all:",
98+
},
99+
{
100+
Role: openai.ChatMessageRoleUser,
101+
Content: content,
102+
},
103+
},
104+
},
105+
)
106+
107+
if err != nil {
108+
return "", err
109+
}
110+
111+
return resp.Choices[0].Message.Content, nil
112+
}
113+
73114
// extractWithRules is the core function that handles extraction with or without a specific rule
74115
func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
75116
log.Printf("[INFO] extract %s", reqURL)

backend/extractor/readability_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"testing"
1212
"time"
1313

14+
"github.com/sashabaranov/go-openai"
1415
"github.com/stretchr/testify/assert"
1516
"github.com/stretchr/testify/require"
1617
"go.mongodb.org/mongo-driver/bson/primitive"
@@ -207,3 +208,61 @@ func TestGetContentCustom(t *testing.T) {
207208
assert.Len(t, content, 6988)
208209
assert.Len(t, rich, 7169)
209210
}
211+
212+
func TestUReadability_GenerateSummary(t *testing.T) {
213+
mockOpenAI := &OpenAIClientMock{
214+
CreateChatCompletionFunc: func(ctx context.Context, request openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error) {
215+
return openai.ChatCompletionResponse{
216+
Choices: []openai.ChatCompletionChoice{
217+
{
218+
Message: openai.ChatCompletionMessage{
219+
Content: "This is a summary of the article.",
220+
},
221+
},
222+
},
223+
}, nil
224+
},
225+
}
226+
227+
tests := []struct {
228+
name string
229+
content string
230+
openAIKey string
231+
expectedResult string
232+
expectedError string
233+
}{
234+
{
235+
name: "Valid OpenAI Key and content",
236+
content: "This is a test article content.",
237+
openAIKey: "test-key",
238+
expectedResult: "This is a summary of the article.",
239+
expectedError: "",
240+
},
241+
{
242+
name: "No OpenAI Key",
243+
content: "This is a test article content.",
244+
openAIKey: "",
245+
expectedResult: "",
246+
expectedError: "OpenAI key is not set",
247+
},
248+
}
249+
250+
for _, tt := range tests {
251+
t.Run(tt.name, func(t *testing.T) {
252+
readability := UReadability{
253+
OpenAIKey: tt.openAIKey,
254+
openAIClient: mockOpenAI,
255+
}
256+
257+
result, err := readability.GenerateSummary(context.Background(), tt.content)
258+
259+
if tt.expectedError != "" {
260+
require.Error(t, err)
261+
assert.Contains(t, err.Error(), tt.expectedError)
262+
} else {
263+
require.NoError(t, err)
264+
assert.Equal(t, tt.expectedResult, result)
265+
}
266+
})
267+
}
268+
}

backend/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ require (
1212
github.com/jessevdk/go-flags v1.6.1
1313
github.com/kennygrant/sanitize v1.2.4
1414
github.com/mauidude/go-readability v0.0.0-20220221173116-a9b3620098b7
15+
github.com/sashabaranov/go-openai v1.38.2
1516
github.com/stretchr/testify v1.10.0
1617
go.mongodb.org/mongo-driver v1.17.3
1718
golang.org/x/net v0.38.0

backend/go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6So
168168
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
169169
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
170170
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
171+
github.com/sashabaranov/go-openai v1.38.2 h1:akrssjj+6DY3lWuDwHv6cBvJ8Z+FZDM9XEaaYFt0Auo=
172+
github.com/sashabaranov/go-openai v1.38.2/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
171173
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
172174
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
173175
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=

backend/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ var opts struct {
2727
MongoURI string `short:"m" long:"mongo-uri" env:"MONGO_URI" required:"true" description:"MongoDB connection string"`
2828
MongoDelay time.Duration `long:"mongo-delay" env:"MONGO_DELAY" default:"0" description:"mongo initial delay"`
2929
MongoDB string `long:"mongo-db" env:"MONGO_DB" default:"ureadability" description:"mongo database name"`
30+
OpenAIKey string `long:"openai_key" env:"OPENAI_KEY" description:"OpenAI API key for summary generation"`
3031
Debug bool `long:"dbg" env:"DEBUG" description:"debug mode"`
3132
}
3233

@@ -52,6 +53,7 @@ func main() {
5253
TimeOut: 30 * time.Second,
5354
SnippetSize: 300,
5455
Rules: stores.Rules,
56+
OpenAIKey: opts.OpenAIKey,
5557
},
5658
Token: opts.Token,
5759
Credentials: opts.Credentials,

backend/rest/server.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"net/http"
1010
"os"
1111
"path/filepath"
12+
"strconv"
1213
"strings"
1314
"time"
1415

@@ -176,12 +177,27 @@ func (s *Server) extractArticle(w http.ResponseWriter, r *http.Request) {
176177
// if token is not set for application, it won't be checked
177178
func (s *Server) extractArticleEmulateReadability(w http.ResponseWriter, r *http.Request) {
178179
token := r.URL.Query().Get("token")
180+
summary, _ := strconv.ParseBool(r.URL.Query().Get("summary"))
179181

180182
if s.Token != "" && token == "" {
181183
rest.SendErrorJSON(w, r, log.Default(), http.StatusExpectationFailed, nil, "no token passed")
182184
return
183185
}
184186

187+
// Summary generation requires both an OpenAI key and a server token to be configured; reject the request otherwise
188+
if summary {
189+
if s.Readability.OpenAIKey == "" {
190+
render.Status(r, http.StatusBadRequest)
191+
render.JSON(w, r, JSON{"error": "OpenAI key is not set"})
192+
return
193+
}
194+
if s.Token == "" {
195+
render.Status(r, http.StatusBadRequest)
196+
render.JSON(w, r, JSON{"error": "summary generation requires token, but token is not set for the server"})
197+
return
198+
}
199+
}
200+
185201
if s.Token != "" && s.Token != token {
186202
rest.SendErrorJSON(w, r, log.Default(), http.StatusUnauthorized, nil, "wrong token passed")
187203
return
@@ -199,6 +215,16 @@ func (s *Server) extractArticleEmulateReadability(w http.ResponseWriter, r *http
199215
return
200216
}
201217

218+
if summary {
219+
summaryText, err := s.Readability.GenerateSummary(r.Context(), res.Content)
220+
if err != nil {
221+
render.Status(r, http.StatusInternalServerError)
222+
render.JSON(w, r, JSON{"error": fmt.Sprintf("failed to generate summary: %v", err)})
223+
return
224+
}
225+
res.Summary = summaryText
226+
}
227+
202228
rest.RenderJSON(w, &res)
203229
}
204230

@@ -238,6 +264,13 @@ func (s *Server) handlePreview(w http.ResponseWriter, r *http.Request) {
238264
continue
239265
}
240266

267+
if s.Readability.OpenAIKey != "" {
268+
result.Summary, e = s.Readability.GenerateSummary(r.Context(), result.Content)
269+
if e != nil {
270+
log.Printf("[WARN] failed to generate summary for preview of %s: %v", url, e)
271+
}
272+
}
273+
241274
responses = append(responses, *result)
242275
}
243276

@@ -248,6 +281,7 @@ func (s *Server) handlePreview(w http.ResponseWriter, r *http.Request) {
248281
Excerpt string
249282
Rich template.HTML
250283
Content string
284+
Summary template.HTML
251285
}
252286

253287
results := make([]result, 0, len(responses))
@@ -259,6 +293,8 @@ func (s *Server) handlePreview(w http.ResponseWriter, r *http.Request) {
259293
//nolint:gosec // this content is escaped by Extractor, so it's safe to use it as is
260294
Rich: template.HTML(r.Rich),
261295
Content: r.Content,
296+
//nolint: gosec // we do not expect XSS from OpenAI response
297+
Summary: template.HTML(strings.ReplaceAll(r.Summary, "\n", "<br>")),
262298
})
263299
}
264300

0 commit comments

Comments
 (0)