Skip to content

Commit 739644f

Browse files
paskalclaude
andcommitted
Add article summary feature with OpenAI integration
Implement MongoDB cache for summaries to reduce API costs and improve performance. Rename parameters from OpenAI-specific to more generic API names. Support configurable model selection through ModelType enum or direct model names. Add comprehensive tests with mocks for summary generation and caching. Update documentation in README with summary feature details. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <[email protected]>
1 parent 8c9d615 commit 739644f

File tree

12 files changed

+699
-37
lines changed

12 files changed

+699
-37
lines changed

README.md

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
| address | UKEEPER_ADDRESS | all interfaces | web server listening address |
1212
| port | UKEEPER_PORT | `8080` | web server port |
1313
| mongo_uri | MONGO_URI | none | MongoDB connection string, _required_ |
14-
| openai_key | OPENAI_KEY | none | OpenAI API key for summary generation |
14+
| api_key | API_KEY | none | OpenAI API key for summary generation |
15+
| model_type | MODEL_TYPE | `mini` | OpenAI model for summary generation (mini, default, large, or direct model name like gpt-4o) |
1516
| frontend_dir | FRONTEND_DIR | `/srv/web` | directory with frontend files |
1617
| token | TOKEN | none | token for /content/v1/parser endpoint auth |
1718
| mongo-delay | MONGO_DELAY | `0` | mongo initial delay |
@@ -24,6 +25,31 @@
2425
GET /api/content/v1/parser?token=secret&summary=true&url=http://aa.com/blah - extract content (emulate Readability API parse call); the `summary` parameter is optional and only takes effect when both an OpenAI API key and a server token are configured
2526
POST /api/v1/extract {url: http://aa.com/blah} - extract content
2627

28+
### Article Summary Feature
29+
30+
The application can generate concise summaries of article content using OpenAI's GPT models:
31+
32+
1. **Configuration**:
33+
- Set `api_key` to your OpenAI API key
34+
- Optionally set `model_type` to specify which model to use:
35+
- Simplified options: `mini` (default, uses gpt-4o-mini), `default` (uses gpt-4o), `large` (uses gpt-4o)
36+
- Direct specification: Any valid OpenAI model name (e.g., `gpt-4o`, `gpt-4o-mini`, etc.)
37+
- A server token must be configured for security reasons
38+
39+
2. **Usage**:
40+
- Add `summary=true` parameter to the `/api/content/v1/parser` endpoint
41+
- Example: `/api/content/v1/parser?token=secret&summary=true&url=http://example.com/article`
42+
43+
3. **Features**:
44+
- Summaries are cached in MongoDB to reduce API costs and improve performance
45+
- The cache stores:
46+
- Content hash (to identify articles)
47+
- Summary text
48+
- Model used for generation
49+
- Creation and update timestamps
50+
- If the same content is requested again, the cached summary is returned
51+
- The preview page automatically shows summaries when available
52+
2753
## Development
2854

2955
### Running tests

backend/datastore/mongo.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,15 +37,29 @@ func New(connectionURI, dbName string, delay time.Duration) (*MongoServer, error
3737
return &MongoServer{client: client, dbName: dbName}, nil
3838
}
3939

40+
// Stores contains all DAO instances
41+
type Stores struct {
42+
Rules RulesDAO
43+
Summaries SummariesDAO
44+
}
45+
4046
// GetStores initialize collections and make indexes
41-
func (m *MongoServer) GetStores() (rules RulesDAO) {
47+
func (m *MongoServer) GetStores() Stores {
4248
rIndexes := []mongo.IndexModel{
4349
{Keys: bson.D{{Key: "enabled", Value: 1}, {Key: "domain", Value: 1}}},
4450
{Keys: bson.D{{Key: "user", Value: 1}, {Key: "domain", Value: 1}, {Key: "enabled", Value: 1}}},
4551
{Keys: bson.D{{Key: "domain", Value: 1}, {Key: "match_urls", Value: 1}}},
4652
}
47-
rules = RulesDAO{Collection: m.collection("rules", rIndexes)}
48-
return rules
53+
54+
sIndexes := []mongo.IndexModel{
55+
{Keys: bson.D{{Key: "created_at", Value: 1}}},
56+
{Keys: bson.D{{Key: "model", Value: 1}}},
57+
}
58+
59+
return Stores{
60+
Rules: RulesDAO{Collection: m.collection("rules", rIndexes)},
61+
Summaries: SummariesDAO{Collection: m.collection("summaries", sIndexes)},
62+
}
4963
}
5064

5165
// collection makes collection with indexes

backend/datastore/rules_test.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ func TestRules(t *testing.T) {
1919
server, err := New("mongodb://localhost:27017/", "test_ureadability", 0)
2020
require.NoError(t, err)
2121
assert.NotNil(t, server.client)
22-
rules := server.GetStores()
22+
stores := server.GetStores()
23+
assert.NotNil(t, stores)
24+
rules := stores.Rules
2325
assert.NotNil(t, rules)
2426
rule := Rule{
2527
Domain: randStringBytesRmndr(42) + ".com",
@@ -74,7 +76,9 @@ func TestRulesCanceledContext(t *testing.T) {
7476
server, err := New("mongodb://wrong", "", 0)
7577
require.NoError(t, err)
7678
assert.NotNil(t, server.client)
77-
rules := server.GetStores()
79+
stores := server.GetStores()
80+
assert.NotNil(t, stores)
81+
rules := stores.Rules
7882
assert.NotNil(t, rules)
7983

8084
ctx, cancel := context.WithCancel(context.Background())

backend/datastore/summaries.go

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
// Package datastore provides mongo implementation for store to keep and access summaries
2+
package datastore
3+
4+
import (
5+
"context"
6+
"crypto/sha256"
7+
"encoding/hex"
8+
"fmt"
9+
"time"
10+
11+
log "github.com/go-pkgz/lgr"
12+
"go.mongodb.org/mongo-driver/bson"
13+
"go.mongodb.org/mongo-driver/mongo"
14+
"go.mongodb.org/mongo-driver/mongo/options"
15+
)
16+
17+
// Summary is one cached article summary as stored in MongoDB.
type Summary struct {
	// ID is the SHA256 hash of the content and serves as the document _id.
	ID string `bson:"_id"`
	// Content is the original text that was summarized (could be truncated
	// for storage efficiency).
	Content string `bson:"content"`
	// Summary is the generated summary text.
	Summary string `bson:"summary"`
	// Model is the OpenAI model used for summarization.
	Model string `bson:"model"`
	// CreatedAt is the creation timestamp of the cache entry.
	CreatedAt time.Time `bson:"created_at"`
	// UpdatedAt is the timestamp of the most recent save.
	UpdatedAt time.Time `bson:"updated_at"`
}
26+
27+
// SummariesDAO handles database operations for article summaries
28+
type SummariesDAO struct {
29+
Collection *mongo.Collection
30+
}
31+
32+
// Get returns summary by content hash
33+
func (s SummariesDAO) Get(ctx context.Context, content string) (Summary, bool) {
34+
contentHash := generateContentHash(content)
35+
res := s.Collection.FindOne(ctx, bson.M{"_id": contentHash})
36+
if res.Err() != nil {
37+
if res.Err() == mongo.ErrNoDocuments {
38+
return Summary{}, false
39+
}
40+
log.Printf("[WARN] can't get summary for hash %s: %v", contentHash, res.Err())
41+
return Summary{}, false
42+
}
43+
44+
summary := Summary{}
45+
if err := res.Decode(&summary); err != nil {
46+
log.Printf("[WARN] can't decode summary document for hash %s: %v", contentHash, err)
47+
return Summary{}, false
48+
}
49+
50+
return summary, true
51+
}
52+
53+
// Save creates or updates summary in the database
54+
func (s SummariesDAO) Save(ctx context.Context, summary Summary) error {
55+
if summary.ID == "" {
56+
summary.ID = generateContentHash(summary.Content)
57+
}
58+
59+
if summary.CreatedAt.IsZero() {
60+
summary.CreatedAt = time.Now()
61+
}
62+
summary.UpdatedAt = time.Now()
63+
64+
opts := options.Update().SetUpsert(true)
65+
_, err := s.Collection.UpdateOne(
66+
ctx,
67+
bson.M{"_id": summary.ID},
68+
bson.M{"$set": summary},
69+
opts,
70+
)
71+
if err != nil {
72+
return fmt.Errorf("failed to save summary: %w", err)
73+
}
74+
return nil
75+
}
76+
77+
// Delete removes summary from the database
78+
func (s SummariesDAO) Delete(ctx context.Context, contentHash string) error {
79+
_, err := s.Collection.DeleteOne(ctx, bson.M{"_id": contentHash})
80+
if err != nil {
81+
return fmt.Errorf("failed to delete summary: %w", err)
82+
}
83+
return nil
84+
}
85+
86+
// generateContentHash returns the hex-encoded SHA-256 digest of content, which
// is used as the summary document ID.
func generateContentHash(content string) string {
	hasher := sha256.New()
	// Write on a hash.Hash never returns an error, so the result is ignored.
	_, _ = hasher.Write([]byte(content))
	return hex.EncodeToString(hasher.Sum(nil))
}

backend/datastore/summaries_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
package datastore
2+
3+
import (
4+
"context"
5+
"os"
6+
"testing"
7+
"time"
8+
9+
"github.com/stretchr/testify/assert"
10+
"github.com/stretchr/testify/require"
11+
)
12+
13+
func TestSummariesDAO_SaveAndGet(t *testing.T) {
14+
if _, ok := os.LookupEnv("ENABLE_MONGO_TESTS"); !ok {
15+
t.Skip("ENABLE_MONGO_TESTS env variable is not set")
16+
}
17+
18+
mdb, err := New("mongodb://localhost:27017", "test_ureadability", 0)
19+
require.NoError(t, err)
20+
21+
// Create a unique collection for this test to avoid conflicts
22+
collection := mdb.client.Database(mdb.dbName).Collection("summaries_test")
23+
defer func() {
24+
_ = collection.Drop(context.Background())
25+
}()
26+
27+
dao := SummariesDAO{Collection: collection}
28+
29+
content := "This is a test article content. It should generate a unique hash."
30+
summary := Summary{
31+
Content: content,
32+
Summary: "This is a test summary of the article.",
33+
Model: "gpt-4o-mini",
34+
CreatedAt: time.Now(),
35+
}
36+
37+
// Test saving a summary
38+
err = dao.Save(context.Background(), summary)
39+
require.NoError(t, err)
40+
41+
// Test getting the summary
42+
foundSummary, found := dao.Get(context.Background(), content)
43+
assert.True(t, found)
44+
assert.Equal(t, summary.Summary, foundSummary.Summary)
45+
assert.Equal(t, summary.Model, foundSummary.Model)
46+
assert.NotEmpty(t, foundSummary.ID)
47+
48+
// Test getting a non-existent summary
49+
_, found = dao.Get(context.Background(), "non-existent content")
50+
assert.False(t, found)
51+
52+
// Test updating an existing summary
53+
updatedSummary := Summary{
54+
ID: foundSummary.ID,
55+
Content: content,
56+
Summary: "This is an updated summary.",
57+
Model: "gpt-4o-mini",
58+
CreatedAt: foundSummary.CreatedAt,
59+
}
60+
61+
err = dao.Save(context.Background(), updatedSummary)
62+
require.NoError(t, err)
63+
64+
foundSummary, found = dao.Get(context.Background(), content)
65+
assert.True(t, found)
66+
assert.Equal(t, "This is an updated summary.", foundSummary.Summary)
67+
assert.Equal(t, updatedSummary.CreatedAt, foundSummary.CreatedAt)
68+
assert.NotEqual(t, updatedSummary.UpdatedAt, foundSummary.UpdatedAt) // UpdatedAt should be set by the DAO
69+
70+
// Test deleting a summary
71+
err = dao.Delete(context.Background(), foundSummary.ID)
72+
require.NoError(t, err)
73+
74+
_, found = dao.Get(context.Background(), content)
75+
assert.False(t, found)
76+
}
77+
78+
func TestGenerateContentHash(t *testing.T) {
79+
content1 := "This is a test content."
80+
content2 := "This is a different test content."
81+
82+
hash1 := generateContentHash(content1)
83+
hash2 := generateContentHash(content2)
84+
85+
assert.NotEqual(t, hash1, hash2)
86+
assert.Equal(t, hash1, generateContentHash(content1)) // Same content should produce same hash
87+
assert.Equal(t, 64, len(hash1)) // SHA-256 produces 64 character hex string
88+
}

backend/extractor/pics_test.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"os"
1010
"strings"
1111
"testing"
12+
"time"
1213

1314
"github.com/PuerkitoBio/goquery"
1415
"github.com/stretchr/testify/assert"
@@ -27,7 +28,7 @@ func TestExtractPics(t *testing.T) {
2728
defer ts.Close()
2829

2930
t.Log("test main pic")
30-
lr := UReadability{TimeOut: 30, SnippetSize: 200}
31+
lr := UReadability{TimeOut: time.Second * 30, SnippetSize: 200}
3132
a, err := lr.Extract(context.Background(), ts.URL+"/2015/09/25/poiezdka-s-apple-maps/")
3233
require.NoError(t, err)
3334
allImages := []string{
@@ -41,7 +42,7 @@ func TestExtractPics(t *testing.T) {
4142

4243
func TestExtractPicsDirectly(t *testing.T) {
4344
t.Log("test pic directly")
44-
lr := UReadability{TimeOut: 30, SnippetSize: 200}
45+
lr := UReadability{TimeOut: 30 * time.Second, SnippetSize: 200}
4546
t.Run("normal image retrieval", func(t *testing.T) {
4647
data := `<body>
4748
<img class="alignright size-full wp-image-944214 lazyloadableImage lazyLoad-fadeIn" alt="View Page Source" width="308" height="508" data-original="https://cdn1.tnwcdn.com/wp-content/blogs.dir/1/files/2016/01/page-source.jpg" src="https://cdn1.tnwcdn.com/wp-content/blogs.dir/1/files/2016/01/page-source.jpg"></body>`

0 commit comments

Comments
 (0)