diff --git a/lunatrace/bsl/hasura/migrations/lunatrace/1679066158985_add_ml_tables/down.sql b/lunatrace/bsl/hasura/migrations/lunatrace/1679066158985_add_ml_tables/down.sql new file mode 100644 index 000000000..95bc63f6d --- /dev/null +++ b/lunatrace/bsl/hasura/migrations/lunatrace/1679066158985_add_ml_tables/down.sql @@ -0,0 +1,8 @@ + +DROP TABLE vulnerability.code_snippet; + +-- Drop the generated summary that was added to the reference to make it easier for the LLM to choose what to read +ALTER TABLE vulnerability.reference_content DROP COLUMN summary; + +ALTER TABLE package.package DROP COLUMN readme_text; +ALTER TABLE package.package DROP COLUMN use_case_summary; diff --git a/lunatrace/bsl/hasura/migrations/lunatrace/1679066158985_add_ml_tables/up.sql b/lunatrace/bsl/hasura/migrations/lunatrace/1679066158985_add_ml_tables/up.sql new file mode 100644 index 000000000..abcc2d5d8 --- /dev/null +++ b/lunatrace/bsl/hasura/migrations/lunatrace/1679066158985_add_ml_tables/up.sql @@ -0,0 +1,22 @@ + +CREATE TABLE vulnerability.code_snippet +( + id uuid DEFAULT public.gen_random_uuid() NOT NULL PRIMARY KEY, + created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP NOT NULL, + -- Reference may be null because we may have pulled code from a non-web source such as vuln-db + reference_id uuid NULL references vulnerability.reference, + -- Include url since reference might be null but it's still nice to be able to point to a source like a vuln-db link for non-scraped content + source_url text NOT NULL, + vulnerability uuid NOT NULL references vulnerability.vulnerability, + code text NOT NULL, + score integer NOT NULL, + summary text NOT NULL, + type text NOT NULL, + language text NOT NULL +); + +-- Add a generated summary to the reference to make it easier for the LLM to choose what to read +ALTER TABLE vulnerability.reference_content ADD COLUMN summary text NULL; + +ALTER TABLE package.package ADD COLUMN readme_text text NULL; +ALTER TABLE package.package ADD COLUMN use_case_summary text NULL; diff --git
a/lunatrace/bsl/ml/python/scrape_utils/summarize_package_readme.py b/lunatrace/bsl/ml/python/scrape_utils/summarize_package_readme.py index 07411a22c..bb22f6aeb 100644 --- a/lunatrace/bsl/ml/python/scrape_utils/summarize_package_readme.py +++ b/lunatrace/bsl/ml/python/scrape_utils/summarize_package_readme.py @@ -18,6 +18,7 @@ An explanation of the Fastify framework, for instance, would be identical, because the use case of the two libraries is the same. Don't mention the license, the creators name, or anything that isn't relevant to what the library is used for. +If you can't tell because the readme is useless and you're not familiar with the library from prior knowledge, return nothing at all, just empty, no words or explanation. ---- BEGIN README ---- {text} ---- END README ---- diff --git a/lunatrace/bsl/ml/python/scrape_utils/summarize_scraped.py b/lunatrace/bsl/ml/python/scrape_utils/summarize_scraped.py index c8cac493c..c06386df5 100644 --- a/lunatrace/bsl/ml/python/scrape_utils/summarize_scraped.py +++ b/lunatrace/bsl/ml/python/scrape_utils/summarize_scraped.py @@ -88,7 +88,7 @@ def main(args): print(results) def add_subparser(subparsers): - subparser = subparsers.add_parser('summarize-scraped', help="takes any page content and a command to extract some information from it, as desired. Useful when you have a specific question to ask of an advisory") + subparser = subparsers.add_parser('summarize-scraped', help="takes any page content and a command to extract some information from it, as desired. Useful when you have a specific question to ask of an advisory. Not used for advisory ingestion, instead used by the chat-bot in real time.") subparser.add_argument("contents", nargs = 1, type = str, help = "a string of page contents") subparser.add_argument("query", nargs = 1, type = str, help = "query string that the scraper will try to focus on. 
can be phraised as a question or command, both are fine") diff --git a/lunatrace/gogen/sqlgen/lunatrace/package/model/package.go b/lunatrace/gogen/sqlgen/lunatrace/package/model/package.go index d3fb89e4b..105c4b829 100644 --- a/lunatrace/gogen/sqlgen/lunatrace/package/model/package.go +++ b/lunatrace/gogen/sqlgen/lunatrace/package/model/package.go @@ -24,4 +24,6 @@ type Package struct { LastFailedFetch *time.Time LastSuccessfulFetch *time.Time Internal bool + ReadmeText *string + UseCaseSummary *string } diff --git a/lunatrace/gogen/sqlgen/lunatrace/package/table/package.go b/lunatrace/gogen/sqlgen/lunatrace/package/table/package.go index a1f040429..dda6c1d61 100644 --- a/lunatrace/gogen/sqlgen/lunatrace/package/table/package.go +++ b/lunatrace/gogen/sqlgen/lunatrace/package/table/package.go @@ -26,6 +26,8 @@ type packageTable struct { LastFailedFetch postgres.ColumnTimestampz LastSuccessfulFetch postgres.ColumnTimestampz Internal postgres.ColumnBool + ReadmeText postgres.ColumnString + UseCaseSummary postgres.ColumnString AllColumns postgres.ColumnList MutableColumns postgres.ColumnList @@ -75,8 +77,10 @@ func newPackageTableImpl(schemaName, tableName, alias string) packageTable { LastFailedFetchColumn = postgres.TimestampzColumn("last_failed_fetch") LastSuccessfulFetchColumn = postgres.TimestampzColumn("last_successful_fetch") InternalColumn = postgres.BoolColumn("internal") - allColumns = postgres.ColumnList{IDColumn, PackageManagerColumn, CustomRegistryColumn, NameColumn, DescriptionColumn, UpstreamDataColumn, LastFailedFetchColumn, LastSuccessfulFetchColumn, InternalColumn} - mutableColumns = postgres.ColumnList{PackageManagerColumn, CustomRegistryColumn, NameColumn, DescriptionColumn, UpstreamDataColumn, LastFailedFetchColumn, LastSuccessfulFetchColumn, InternalColumn} + ReadmeTextColumn = postgres.StringColumn("readme_text") + UseCaseSummaryColumn = postgres.StringColumn("use_case_summary") + allColumns = postgres.ColumnList{IDColumn, PackageManagerColumn, 
CustomRegistryColumn, NameColumn, DescriptionColumn, UpstreamDataColumn, LastFailedFetchColumn, LastSuccessfulFetchColumn, InternalColumn, ReadmeTextColumn, UseCaseSummaryColumn} + mutableColumns = postgres.ColumnList{PackageManagerColumn, CustomRegistryColumn, NameColumn, DescriptionColumn, UpstreamDataColumn, LastFailedFetchColumn, LastSuccessfulFetchColumn, InternalColumn, ReadmeTextColumn, UseCaseSummaryColumn} ) return packageTable{ @@ -92,6 +96,8 @@ func newPackageTableImpl(schemaName, tableName, alias string) packageTable { LastFailedFetch: LastFailedFetchColumn, LastSuccessfulFetch: LastSuccessfulFetchColumn, Internal: InternalColumn, + ReadmeText: ReadmeTextColumn, + UseCaseSummary: UseCaseSummaryColumn, AllColumns: allColumns, MutableColumns: mutableColumns, diff --git a/lunatrace/gogen/sqlgen/lunatrace/vulnerability/model/code_snippet.go b/lunatrace/gogen/sqlgen/lunatrace/vulnerability/model/code_snippet.go new file mode 100644 index 000000000..e0bcb45c4 --- /dev/null +++ b/lunatrace/gogen/sqlgen/lunatrace/vulnerability/model/code_snippet.go @@ -0,0 +1,26 @@ +// +// Code generated by go-jet DO NOT EDIT. 
+// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package model + +import ( + "github.com/google/uuid" + "time" +) + +type CodeSnippet struct { + ID uuid.UUID `sql:"primary_key"` + CreatedAt time.Time + ReferenceID *uuid.UUID + SourceURL string + Vulnerability uuid.UUID + Code string + Score int32 + Summary string + Type string + Language string +} diff --git a/lunatrace/gogen/sqlgen/lunatrace/vulnerability/model/reference_content.go b/lunatrace/gogen/sqlgen/lunatrace/vulnerability/model/reference_content.go index 205343f26..aec271ede 100644 --- a/lunatrace/gogen/sqlgen/lunatrace/vulnerability/model/reference_content.go +++ b/lunatrace/gogen/sqlgen/lunatrace/vulnerability/model/reference_content.go @@ -21,4 +21,5 @@ type ReferenceContent struct { ContentType string LastSuccessfulFetch *time.Time ParsedContent *string + Summary *string } diff --git a/lunatrace/gogen/sqlgen/lunatrace/vulnerability/table/code_snippet.go b/lunatrace/gogen/sqlgen/lunatrace/vulnerability/table/code_snippet.go new file mode 100644 index 000000000..423cd4372 --- /dev/null +++ b/lunatrace/gogen/sqlgen/lunatrace/vulnerability/table/code_snippet.go @@ -0,0 +1,102 @@ +// +// Code generated by go-jet DO NOT EDIT.
+// +// WARNING: Changes to this file may cause incorrect behavior +// and will be lost if the code is regenerated +// + +package table + +import ( + "github.com/go-jet/jet/v2/postgres" +) + +var CodeSnippet = newCodeSnippetTable("vulnerability", "code_snippet", "") + +type codeSnippetTable struct { + postgres.Table + + //Columns + ID postgres.ColumnString + CreatedAt postgres.ColumnTimestampz + ReferenceID postgres.ColumnString + SourceURL postgres.ColumnString + Vulnerability postgres.ColumnString + Code postgres.ColumnString + Score postgres.ColumnInteger + Summary postgres.ColumnString + Type postgres.ColumnString + Language postgres.ColumnString + + AllColumns postgres.ColumnList + MutableColumns postgres.ColumnList +} + +type CodeSnippetTable struct { + codeSnippetTable + + EXCLUDED codeSnippetTable +} + +// AS creates new CodeSnippetTable with assigned alias +func (a CodeSnippetTable) AS(alias string) *CodeSnippetTable { + return newCodeSnippetTable(a.SchemaName(), a.TableName(), alias) +} + +// Schema creates new CodeSnippetTable with assigned schema name +func (a CodeSnippetTable) FromSchema(schemaName string) *CodeSnippetTable { + return newCodeSnippetTable(schemaName, a.TableName(), a.Alias()) +} + +// WithPrefix creates new CodeSnippetTable with assigned table prefix +func (a CodeSnippetTable) WithPrefix(prefix string) *CodeSnippetTable { + return newCodeSnippetTable(a.SchemaName(), prefix+a.TableName(), a.TableName()) +} + +// WithSuffix creates new CodeSnippetTable with assigned table suffix +func (a CodeSnippetTable) WithSuffix(suffix string) *CodeSnippetTable { + return newCodeSnippetTable(a.SchemaName(), a.TableName()+suffix, a.TableName()) +} + +func newCodeSnippetTable(schemaName, tableName, alias string) *CodeSnippetTable { + return &CodeSnippetTable{ + codeSnippetTable: newCodeSnippetTableImpl(schemaName, tableName, alias), + EXCLUDED: newCodeSnippetTableImpl("", "excluded", ""), + } +} + +func newCodeSnippetTableImpl(schemaName, tableName, 
alias string) codeSnippetTable { + var ( + IDColumn = postgres.StringColumn("id") + CreatedAtColumn = postgres.TimestampzColumn("created_at") + ReferenceIDColumn = postgres.StringColumn("reference_id") + SourceURLColumn = postgres.StringColumn("source_url") + VulnerabilityColumn = postgres.StringColumn("vulnerability") + CodeColumn = postgres.StringColumn("code") + ScoreColumn = postgres.IntegerColumn("score") + SummaryColumn = postgres.StringColumn("summary") + TypeColumn = postgres.StringColumn("type") + LanguageColumn = postgres.StringColumn("language") + allColumns = postgres.ColumnList{IDColumn, CreatedAtColumn, ReferenceIDColumn, SourceURLColumn, VulnerabilityColumn, CodeColumn, ScoreColumn, SummaryColumn, TypeColumn, LanguageColumn} + mutableColumns = postgres.ColumnList{CreatedAtColumn, ReferenceIDColumn, SourceURLColumn, VulnerabilityColumn, CodeColumn, ScoreColumn, SummaryColumn, TypeColumn, LanguageColumn} + ) + + return codeSnippetTable{ + Table: postgres.NewTable(schemaName, tableName, alias, allColumns...), + + //Columns + ID: IDColumn, + CreatedAt: CreatedAtColumn, + ReferenceID: ReferenceIDColumn, + SourceURL: SourceURLColumn, + Vulnerability: VulnerabilityColumn, + Code: CodeColumn, + Score: ScoreColumn, + Summary: SummaryColumn, + Type: TypeColumn, + Language: LanguageColumn, + + AllColumns: allColumns, + MutableColumns: mutableColumns, + } +} diff --git a/lunatrace/gogen/sqlgen/lunatrace/vulnerability/table/reference_content.go b/lunatrace/gogen/sqlgen/lunatrace/vulnerability/table/reference_content.go index 2bbeeaf17..9bd1b257e 100644 --- a/lunatrace/gogen/sqlgen/lunatrace/vulnerability/table/reference_content.go +++ b/lunatrace/gogen/sqlgen/lunatrace/vulnerability/table/reference_content.go @@ -25,6 +25,7 @@ type referenceContentTable struct { ContentType postgres.ColumnString LastSuccessfulFetch postgres.ColumnTimestampz ParsedContent postgres.ColumnString + Summary postgres.ColumnString AllColumns postgres.ColumnList MutableColumns 
postgres.ColumnList @@ -73,8 +74,9 @@ func newReferenceContentTableImpl(schemaName, tableName, alias string) reference ContentTypeColumn = postgres.StringColumn("content_type") LastSuccessfulFetchColumn = postgres.TimestampzColumn("last_successful_fetch") ParsedContentColumn = postgres.StringColumn("parsed_content") - allColumns = postgres.ColumnList{IDColumn, ReferenceIDColumn, TitleColumn, ContentColumn, NormalizedContentColumn, ContentTypeColumn, LastSuccessfulFetchColumn, ParsedContentColumn} - mutableColumns = postgres.ColumnList{ReferenceIDColumn, TitleColumn, ContentColumn, NormalizedContentColumn, ContentTypeColumn, LastSuccessfulFetchColumn, ParsedContentColumn} + SummaryColumn = postgres.StringColumn("summary") + allColumns = postgres.ColumnList{IDColumn, ReferenceIDColumn, TitleColumn, ContentColumn, NormalizedContentColumn, ContentTypeColumn, LastSuccessfulFetchColumn, ParsedContentColumn, SummaryColumn} + mutableColumns = postgres.ColumnList{ReferenceIDColumn, TitleColumn, ContentColumn, NormalizedContentColumn, ContentTypeColumn, LastSuccessfulFetchColumn, ParsedContentColumn, SummaryColumn} ) return referenceContentTable{ @@ -89,6 +91,7 @@ func newReferenceContentTableImpl(schemaName, tableName, alias string) reference ContentType: ContentTypeColumn, LastSuccessfulFetch: LastSuccessfulFetchColumn, ParsedContent: ParsedContentColumn, + Summary: SummaryColumn, AllColumns: allColumns, MutableColumns: mutableColumns,