diff --git a/gleam.toml b/gleam.toml index 7080615..2d9f4ed 100644 --- a/gleam.toml +++ b/gleam.toml @@ -29,6 +29,8 @@ gleamsver = ">= 1.0.1 and < 2.0.0" porter_stemmer = ">= 1.0.0 and < 2.0.0" gleam_time = ">= 1.2.0 and < 2.0.0" gleam_httpc = ">= 5.0.0 and < 6.0.0" +edit_distance = ">= 3.0.0 and < 4.0.0" +cell = ">= 1.0.0 and < 2.0.0" [dev-dependencies] gleeunit = "~> 1.0" diff --git a/manifest.toml b/manifest.toml index e0647c2..d619be5 100644 --- a/manifest.toml +++ b/manifest.toml @@ -3,7 +3,9 @@ packages = [ { name = "argv", version = "1.0.2", build_tools = ["gleam"], requirements = [], otp_app = "argv", source = "hex", outer_checksum = "BA1FF0929525DEBA1CE67256E5ADF77A7CDDFE729E3E3F57A5BDCAA031DED09D" }, + { name = "cell", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "cell", source = "hex", outer_checksum = "4200D6FD95E7E720F9376FD4670E7ACC2E6847CF129514D91EB1583218E1C351" }, { name = "directories", version = "1.2.0", build_tools = ["gleam"], requirements = ["envoy", "gleam_stdlib", "platform", "simplifile"], otp_app = "directories", source = "hex", outer_checksum = "D13090CFCDF6759B87217E8DDD73A75903A700148A82C1D33799F333E249BF9E" }, + { name = "edit_distance", version = "3.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "edit_distance", source = "hex", outer_checksum = "7DC465C34695F9E57D79FC65670C53C992CE342BF29E0AA41FF44F61AF62FC56" }, { name = "envoy", version = "1.0.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "envoy", source = "hex", outer_checksum = "95FD059345AA982E89A0B6E2A3BF1CF43E17A7048DCD85B5B65D3B9E4E39D359" }, { name = "exception", version = "2.1.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "exception", source = "hex", outer_checksum = "329D269D5C2A314F7364BD2711372B6F2C58FA6F39981572E5CA68624D291F8C" }, { name = "filepath", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", 
source = "hex", outer_checksum = "B06A9AF0BF10E51401D64B98E4B627F1D2E48C154967DA7AF4D0914780A6D40A" }, @@ -38,6 +40,8 @@ packages = [ [requirements] argv = { version = "~> 1.0" } +cell = { version = ">= 1.0.0 and < 2.0.0" } +edit_distance = { version = ">= 3.0.0 and < 4.0.0" } envoy = { version = ">= 1.0.2 and < 2.0.0" } gleam_erlang = { version = ">= 1.3.0 and < 2.0.0" } gleam_hexpm = { version = ">= 3.0.0 and < 4.0.0" } diff --git a/src/packages/error.gleam b/src/packages/error.gleam index f0a504c..ebf1175 100644 --- a/src/packages/error.gleam +++ b/src/packages/error.gleam @@ -8,5 +8,6 @@ pub type Error { HttpClientError(httpc.HttpError) JsonDecodeError(json.DecodeError) StorageError(storail.StorailError) - EtsTableError + TextIndexEtsTableError + KnownWordsEtsTableError } diff --git a/src/packages/router.gleam b/src/packages/router.gleam index d429cc9..8ec98c2 100644 --- a/src/packages/router.gleam +++ b/src/packages/router.gleam @@ -140,13 +140,23 @@ fn internet_points(ctx: Context) -> Response { fn search(request: Request, context: Context) -> Response { let search_term = get_search_parameter(request) - let assert Ok(packages) = case search_term { - "" -> storage.packages_most_recent_first(context.db) + let assert Ok(search_outcome) = case search_term { + "" -> + storage.packages_most_recent_first(context.db) + |> result.map(storage.Packages) + _ -> storage.search_packages(context.db, context.search_index, search_term) } - page.packages_list(packages, search_term) - |> wisp.html_response(200) + case search_outcome { + storage.Packages(packages:) -> + page.packages_list(packages, search_term) + |> wisp.html_response(200) + + storage.DidYouMean(suggestion:) -> + page.did_you_mean(suggestion, search_term) + |> wisp.html_response(200) + } } fn get_search_parameter(request: Request) -> String { diff --git a/src/packages/storage.gleam b/src/packages/storage.gleam index f0691ee..e8c9cd7 100644 --- a/src/packages/storage.gleam +++ b/src/packages/storage.gleam @@ 
-370,39 +370,44 @@ pub fn list_packages(database: Database) -> Result(List(String), Error) { } } +pub type SearchOutcome { + Packages(packages: List(Package)) + DidYouMean(suggestion: String) +} + pub fn search_packages( db: Database, search: text_search.TextSearchIndex, search_term: String, -) -> Result(List(Package), Error) { - let bool = fn(b) { - case b { - True -> 1 - False -> 0 - } - } +) -> Result(SearchOutcome, Error) { use found <- result.try(text_search.lookup(search, search_term)) + + case found { + [_, ..] -> + rank_found_results(found, db, search_term) + |> result.map(Packages) + + // If no results are found we try and suggest a fix for the search term. + [] -> + case text_search.did_you_mean(search, search_term) { + Ok(suggestion) -> Ok(DidYouMean(suggestion:)) + Error(_) -> Ok(Packages(packages: [])) + } + } +} + +/// Given a list of `text_search` results, this returns a list of the matching +/// packages, ranked from most relevant to least relevant. +/// +fn rank_found_results( + found: List(text_search.Found), + db: Database, + search_term: String, +) -> Result(List(Package), Error) { use packages <- result.map( list.try_map(found, fn(found) { use package <- result.map(get_package(db, found.name)) - - let exact_package_name_match = bool(search_term == package.name) - let is_not_v0 = bool(!string.starts_with(package.latest_version, "0.")) - let is_core_package = bool(override.is_core_package(package.name)) - let updated_at = - float.round(timestamp.to_unix_seconds(package.updated_in_hex_at)) - - // This is the value we use to determine what order packages should be - // shown by. Later list values only take effect if the earlier ones are - // equal. 
- let ordering_key = [ - exact_package_name_match, - is_not_v0, - found.match_count, - is_core_package, - package.downloads_recent, - updated_at, - ] + let ordering_key = package_ordering_key(search_term, package, found) #(ordering_key, package) }), ) @@ -412,6 +417,38 @@ pub fn search_packages( |> list.map(fn(pair) { pair.1 }) } +/// This is the value we use to determine what order packages should be shown +/// by. +/// +fn package_ordering_key( + search_term: String, + package: Package, + found: text_search.Found, +) -> List(Int) { + let bool = fn(bool) { + case bool { + True -> 1 + False -> 0 + } + } + + let exact_package_name_match = bool(search_term == package.name) + let is_not_v0 = bool(!string.starts_with(package.latest_version, "0.")) + let is_core_package = bool(override.is_core_package(package.name)) + let updated_at = + float.round(timestamp.to_unix_seconds(package.updated_in_hex_at)) + + // Later list values only take effect if the earlier ones are equal. + [ + exact_package_name_match, + is_not_v0, + found.match_count, + is_core_package, + package.downloads_recent, + updated_at, + ] +} + fn list_compare( a: List(t), b: List(t), diff --git a/src/packages/text_search.gleam b/src/packages/text_search.gleam index 48331bb..abae51c 100644 --- a/src/packages/text_search.gleam +++ b/src/packages/text_search.gleam @@ -1,19 +1,31 @@ +import cell.{type Cell} +import edit_distance import ethos.{type BagTable} -import gleam/dict +import gleam/bool +import gleam/dict.{type Dict} +import gleam/int import gleam/list import gleam/option +import gleam/order.{type Order, Eq, Gt, Lt} import gleam/result +import gleam/set.{type Set} import gleam/string import packages/error.{type Error} import packages/override import porter_stemmer pub opaque type TextSearchIndex { - TextSearchIndex(table: BagTable(String, String)) + TextSearchIndex( + table: BagTable(String, String), + known_words: Cell(Set(String)), + ) } pub fn new() -> TextSearchIndex { - TextSearchIndex(ethos.new()) 
+ let known_words = cell.new(cell.new_table()) + let assert Ok(_) = cell.write(known_words, set.new()) + as "cannot initialise cell" + TextSearchIndex(table: ethos.new(), known_words:) } pub fn insert( @@ -23,18 +35,40 @@ pub fn insert( ) -> Result(Nil, Error) { case override.is_ignored_package(name) { True -> Ok(Nil) - False -> - name - |> string.append(" ") - |> string.append(string.replace(name, "_", " ")) - |> string.append(" ") - |> string.append(description) - |> stem_words - |> list.try_each(fn(word) { ethos.insert(index.table, word, name) }) - |> result.replace_error(error.EtsTableError) + False -> { + let words = split_and_normalise_words(name <> " " <> description) + use _ <- result.try(insert_package(name, words, index)) + use _ <- result.try(update_known_words(words, index)) + Ok(Nil) + } } } +fn insert_package( + name: String, + words: List(String), + index: TextSearchIndex, +) -> Result(Nil, Error) { + words + |> stem_words + |> list.try_each(fn(word) { ethos.insert(index.table, word, name) }) + |> result.replace_error(error.TextIndexEtsTableError) +} + +fn update_known_words( + new_words: List(String), + index: TextSearchIndex, +) -> Result(Nil, Error) { + cell.read(index.known_words) + |> result.try(fn(known_words) { + new_words + |> set.from_list + |> set.union(known_words) + |> cell.write(index.known_words, _) + }) + |> result.replace_error(error.KnownWordsEtsTableError) +} + pub fn update( index: TextSearchIndex, name name: String, @@ -46,24 +80,32 @@ pub fn update( /// Find all matches for the given search term. The list is not returned in any /// order, but each found item is returned with a match count. 
+/// pub fn lookup( index: TextSearchIndex, phrase: String, ) -> Result(List(Found), Error) { - let phrase = string.lowercase(phrase) - stem_words(phrase) + phrase + |> split_and_normalise_words + |> stem_words |> list.flat_map(override.expand_search_term) |> list.try_map(ethos.get(index.table, _)) + |> result.replace_error(error.TextIndexEtsTableError) |> result.map(fn(names) { names |> list.flatten - |> list.fold(dict.new(), fn(counters, name) { - dict.upsert(counters, name, fn(x) { option.unwrap(x, 0) + 1 }) - }) + |> count_occurrences |> dict.to_list - |> list.map(fn(pair) { Found(pair.0, pair.1) }) + |> list.map(fn(pair) { Found(name: pair.0, match_count: pair.1) }) + }) +} + +fn count_occurrences(list: List(a)) -> Dict(a, Int) { + list.fold(list, dict.new(), fn(counters, name) { + dict.upsert(counters, name, fn(occurrences) { + option.unwrap(occurrences, 0) + 1 + }) }) - |> result.replace_error(error.EtsTableError) } pub type Found { @@ -72,11 +114,17 @@ pub type Found { fn remove(index: TextSearchIndex, name: String) -> Result(Nil, Error) { ethos.delete_value(index.table, name) - |> result.replace_error(error.EtsTableError) + |> result.replace_error(error.TextIndexEtsTableError) } -fn stem_words(phrase: String) -> List(String) { - phrase +fn stem_words(words: List(String)) -> List(String) { + words + |> list.map(porter_stemmer.stem) + |> list.unique +} + +fn split_and_normalise_words(text: String) -> List(String) { + text |> string.lowercase |> string.replace("-", " ") |> string.replace("_", " ") @@ -88,8 +136,6 @@ fn stem_words(phrase: String) -> List(String) { |> string.split(" ") |> list.filter(fn(word) { word != "" }) |> list.map(normalise_spelling) - |> list.map(porter_stemmer.stem) - |> list.unique } fn normalise_spelling(word: String) -> String { @@ -124,3 +170,82 @@ fn normalise_spelling(word: String) -> String { _ -> word } } + +pub fn did_you_mean( + index: TextSearchIndex, + phrase: String, +) -> Result(String, Nil) { + use words <- result.try( + 
cell.read(index.known_words) +    |> result.replace_error(Nil), +  ) + +  // We want to fix each word in the phrase individually and then join them back +  // together to give a suggestion. +  let suggestion = +    phrase +    |> string.lowercase +    |> string.split(on: " ") +    |> list.map(fn(word) { +      // If we find a fix we replace the word with the new suggestion, otherwise +      // we leave it unchanged. +      closest_word(word, from: words) +      |> result.unwrap(word) +    }) +    |> string.join(with: " ") + +  case suggestion == phrase { +    False -> Ok(suggestion) +    True -> Error(Nil) +  } +} + +/// Finds the closest word amongst `words`. If none of the possible words is +/// close enough then this returns `Error(Nil)`. +/// +fn closest_word(to word: String, from words: Set(String)) -> Result(String, Nil) { +  // We want to limit the maximum edit distance. Otherwise we could end up +  // suggesting fixes that are not related at all to the original query. +  let word_length = string.length(word) +  let limit = int.max(1, word_length / 3) + +  set.fold(words, [], fn(acc, candidate) { +    let candidate_length = string.length(candidate) +    let minimum_distance = int.absolute_value(candidate_length - word_length) + +    // If the minimum distance is greater than the allowed limit then we don't +    // even waste any time computing the edit distance of the two strings! +    use <- bool.guard(when: minimum_distance > limit, return: acc) +    let distance = edit_distance.levenshtein(word, candidate) +    case distance > limit { +      False -> [#(candidate, distance), ..acc] +      True -> acc +    } +  }) +  // We only pick the word with the smallest possible edit distance that's below +  // the given threshold. +  |> min(fn(a, b) { int.compare(a.1, b.1) }) +  |> result.map(fn(suggestion) { suggestion.0 }) +} + +/// Find the minimum element in a list; this runs in linear time. +/// If there are multiple elements with the same minimum value, the one that is +/// encountered first is returned. 
+/// +fn min(in list: List(a), by compare: fn(a, a) -> Order) -> Result(a, Nil) { + case list { + [] -> Error(Nil) + [min, ..rest] -> Ok(min_loop(rest, min, compare)) + } +} + +fn min_loop(list: List(a), min_so_far: a, compare: fn(a, a) -> Order) -> a { + case list { + [] -> min_so_far + [first, ..rest] -> + case compare(first, min_so_far) { + Eq | Gt -> min_loop(rest, min_so_far, compare) + Lt -> min_loop(rest, first, compare) + } + } +} diff --git a/src/packages/web/page.gleam b/src/packages/web/page.gleam index 3c5ccb2..3cde6c5 100644 --- a/src/packages/web/page.gleam +++ b/src/packages/web/page.gleam @@ -19,6 +19,25 @@ pub fn packages_list(packages: List(Package), search_term: String) -> String { |> layout } +pub fn did_you_mean(suggestion: String, search_term: String) -> String { + html.div([attribute.class("content")], { + [ + html.header([class("page-header")], [ + text("I couldn't find any package matching your search."), + ]), + search_form(search_term), + html.p([attribute.class("package-list-message")], [ + element.text("Did you mean "), + html.a([attribute.href("?search=" <> suggestion)], [ + element.text(suggestion), + ]), + element.text("?"), + ]), + ] + }) + |> layout +} + pub fn internet_points(stats: storage.InternetPoints) -> String { let count_table = fn(rows) { let rows = diff --git a/test/packages/text_search_test.gleam b/test/packages/text_search_test.gleam index 77d301a..dcca909 100644 --- a/test/packages/text_search_test.gleam +++ b/test/packages/text_search_test.gleam @@ -150,3 +150,48 @@ pub fn underscores_test() { let assert Ok(value) = text_search.lookup(index, "lustre_dev") assert value == [Found("lustre", 1), Found("lustre_dev_tools", 2)] } + +pub fn suggesting_fix_for_typo_from_package_name_test() { + let index = text_search.new() + let assert Ok(_) = + text_search.insert(index, "splitter", "a package to write parsers") + + let assert Ok(value) = text_search.did_you_mean(index, "spliter") + assert value == "splitter" +} + +pub fn 
suggesting_fix_for_typo_is_case_insensitive_test() { + let index = text_search.new() + let assert Ok(_) = + text_search.insert(index, "splitter", "a package to write parsers") + + let assert Ok(value) = text_search.did_you_mean(index, "SPLITER") + assert value == "splitter" +} + +pub fn suggesting_fix_for_typo_from_package_description_test() { + let index = text_search.new() + let assert Ok(_) = + text_search.insert(index, "splitter", "a package to write parsers") + + let assert Ok(value) = text_search.did_you_mean(index, "pasers") + assert value == "parsers" +} + +pub fn fix_for_typo_only_suggested_when_close_enough_test() { + let index = text_search.new() + let assert Ok(_) = text_search.insert(index, "lustre", "") + + // "Lustrous" is too far from "lustre", so that's not used as a possible fix + // for the typo + assert Error(Nil) == text_search.did_you_mean(index, "lustrous") +} + +pub fn fix_for_typo_can_fix_multiple_words_at_once_test() { + let index = text_search.new() + let assert Ok(_) = + text_search.insert(index, "pokemon_diamond_and_pearl", "time and space") + + let assert Ok(value) = text_search.did_you_mean(index, "tme und spice") + assert value == "time and space" +}