From 799e6a6b753f0b882640d41bfadd735070d538d5 Mon Sep 17 00:00:00 2001 From: Mykola Fant Date: Mon, 23 Jun 2025 14:57:43 +0300 Subject: [PATCH 01/60] add ids --- preview-src/ui-model.yml | 6 +++--- src/partials/article.hbs | 12 ++++++++---- src/partials/body-home.hbs | 4 ++-- src/partials/body-tutorials-landing.hbs | 5 ++++- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/preview-src/ui-model.yml b/preview-src/ui-model.yml index 74f45d53..f18b1121 100644 --- a/preview-src/ui-model.yml +++ b/preview-src/ui-model.yml @@ -7,9 +7,9 @@ site: keys: # googleAnalytics: 'GTM-MQ59XD6' # aiSearchId: 'ad664bf0-07e2-42e7-9150-2e1b04b15cca' -# docsearchId: 'QK2EAH8GB0' -# docsearchApi: 'ef7bd9485eafbd75d6e8425949eda1f5' -# docsearchIndex: 'prod_hazelcast_docs' + docsearchId: 'FMY8D84KMI' + docsearchApi: 'e0a570340810ad57e7359c2004463654' + docsearchIndex: '_test_test_algolia_crawler_articles' components: - name: abc title: Hazelcast IMDG diff --git a/src/partials/article.hbs b/src/partials/article.hbs index 2f1b3095..c6e3493b 100644 --- a/src/partials/article.hbs +++ b/src/partials/article.hbs @@ -1,6 +1,6 @@
{{#if (eq page.layout '404')}} -

{{{or page.title 'Page Not Found'}}}

+

{{{or page.title 'Page Not Found'}}}

The page you’re looking for does not exist. It may have been moved.

@@ -30,7 +30,9 @@ {{/if}} {{#if (not-eq page.layout 'swagger')}} {{#with page.title}} -

+ {{{page.contents}}} + {{#if page.attributes.include-tutorial-list}} + {{/if}} {{/if}} {{> pagination}} diff --git a/src/partials/body-home.hbs b/src/partials/body-home.hbs index ef42ee68..228f8c75 100644 --- a/src/partials/body-home.hbs +++ b/src/partials/body-home.hbs @@ -1,6 +1,6 @@
-

+

Hazelcast Documentation

@@ -142,7 +142,7 @@

- +
team diff --git a/src/partials/body-tutorials-landing.hbs b/src/partials/body-tutorials-landing.hbs index 20ad9353..ef2b7b71 100644 --- a/src/partials/body-tutorials-landing.hbs +++ b/src/partials/body-tutorials-landing.hbs @@ -1,6 +1,9 @@
-

+

Tutorials & Recipes

From c4f2dd64442cea530fe5f8e40d796c604c3a7963 Mon Sep 17 00:00:00 2001 From: Mykola Fant Date: Wed, 2 Jul 2025 17:50:41 +0300 Subject: [PATCH 02/60] add default minutes value --- .eslintrc | 3 +- crawler.js | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 crawler.js diff --git a/.eslintrc b/.eslintrc index fc504896..ed545bf1 100644 --- a/.eslintrc +++ b/.eslintrc @@ -10,5 +10,6 @@ }], "max-len": [1, 120, 2], "spaced-comment": "off" - } + }, + "ignorePatterns": ["crawler.js"] } diff --git a/crawler.js b/crawler.js new file mode 100644 index 00000000..b1c0ff55 --- /dev/null +++ b/crawler.js @@ -0,0 +1,132 @@ +new Crawler({ + appId: "FMY8D84KMI", + indexPrefix: "_test_", + rateLimit: 8, + maxUrls: 30, + schedule: "on the first day of the month", + startUrls: [], + sitemaps: ["https://docs.hazelcast.com/sitemap.xml"], + saveBackup: false, + ignoreQueryParams: [], + actions: [ + { + indexName: "algolia_crawler_articles_for_instant_search", + pathsToMatch: [ + "https://docs.hazelcast.com/management-center/5.8/getting-started/**", + ], + recordExtractor: ({ url, $ }) => { + const RECORD_TYPE = { + TITLE: 0, + CONTENT: 1, + DESCRIPTION: 2, + SECTION_TITLE: 3, + SUBSECTION_TITLE: 4, + OPENAPI: 5, + }; + const isImdg = /\/imdg\//.test(url.pathname); + const title = $(".doc h1").text().trim(); + const version = $("#navbarProductVersion").text().trim(); + const product = $("#navbarProductName").text().trim(); + const records = []; + + const recordBase = { + title: title, + version: version, + product: product, + pageRank: isImdg ? "-1000" : "0", + }; + + // main title + records.push({ + ...recordBase, + type: RECORD_TYPE.TITLE, + url: url, + }); + + // description, aka #preamble + records.push({ + ...recordBase, + type: RECORD_TYPE.DESCRIPTION, + content: $("#preamble").text().trim(), + url: url, + }); + + $(".sect1").each((_i, sectionEl) => { + const sectionTitle = $(sectionEl).find("h2").text().trim(); + const sectionAnchor = $(sectionEl).find("h2>a.anchor").attr("href"); + // section title + records.push({ + ...recordBase, + type: RECORD_TYPE.SECTION_TITLE, + section: sectionTitle, + url: `${url}${sectionAnchor}`, + }); + // add immediate paragraphs + $(sectionEl) + .find(".sectionbody>.paragraph") + .each((_i, contentEl) => { + const content = $(contentEl).text().trim(); + // filter out 'empty' elements like images + if (content) { + records.push({ + ...recordBase, + section: sectionTitle, + subsection: "subsection", // .sect2 + type: RECORD_TYPE.CONTENT, + content: content, + url: `${url}${sectionAnchor}`, + }); + } + }); + + // add paragraphs from subsections + $(sectionEl) + .find(".sect2") + .each((_i, subsectionEl) => { + const subsectionTitle = $(subsectionEl).find("h3").text().trim(); + const subsectionAnchor = $(subsectionEl) + .find("h3>a.anchor") + .attr("href"); + // subsection title + records.push({ + ...recordBase, + type: RECORD_TYPE.SUBSECTION_TITLE, + section: sectionTitle, + subsection: subsectionTitle, + url: `${url}${subsectionAnchor}`, + }); + // add immediate paragraphs + $(subsectionEl) + .find(".paragraph") + .each((_i, contentEl) => { + const content = $(contentEl).text().trim(); + // filter out 'empty' elements like images + if (content) { + records.push({ + ...recordBase, + section: sectionTitle, + subsection: subsectionTitle, + type: RECORD_TYPE.CONTENT, + content: content, + url: `${url}${subsectionAnchor}`, + }); + } + }); + }); + }); + + return records; + }, + }, + ], + initialIndexSettings: { + algolia_crawler_articles_for_instant_search: { + distinct: true, + attributeForDistinct: "url", + searchableAttributes: ["unordered(title)", "unordered(content)", "url"], + customRanking: ["asc(depth)"], + attributesForFaceting: ["product", "version"], + }, + }, + apiKey: "5747ec3f93ee66da2d518af3eb239294", +}); \ No newline at end of file From 828ecea9d8891f81c48b688605f0d34cfdf68f8f Mon Sep 17 00:00:00 2001 From: Mykola Fant Date: Thu, 3 Jul 2025 17:39:22 +0300 Subject: [PATCH 03/60] add crawler.js --- crawler.js | 134 +++++++++++++++++++++++------------------------------ 1 file changed, 58 insertions(+), 76 deletions(-) diff --git a/crawler.js b/crawler.js index b1c0ff55..31ac95c5 100644 --- a/crawler.js +++ b/crawler.js @@ -12,16 +12,62 @@ new Crawler({ { indexName: "algolia_crawler_articles_for_instant_search", pathsToMatch: [ - "https://docs.hazelcast.com/management-center/5.8/getting-started/**", + "https://docs.hazelcast.com/hazelcast/5.5/clients/**", ], recordExtractor: ({ url, $ }) => { + const createRecord = (type, sections, content, recordUrl) => { + const record = { + ...recordBase, + type: type, + sections: sections, + url: recordUrl, + }; + if (content) { + record.content = content; + } + return record; + }; + + const processParagraphs = (containerEl, sections, baseUrl) => { + const paragraphRecords = []; + containerEl.each((_i, contentEl) => { + const content = $(contentEl).text().trim(); + if (content) { + paragraphRecords.push(createRecord(RECORD_TYPE.CONTENT, sections, content, baseUrl)); + } + }); + return paragraphRecords; + }; + + const processSubsections = (sectionEl, sectionTitle) => { + const subsectionRecords = []; + $(sectionEl).find(".sect2").each((_i, subsectionEl) => { + const subsectionTitle = $(subsectionEl).find("h3").text().trim(); + const subsectionAnchor = $(subsectionEl).find("h3>a.anchor").attr("href"); + const subsectionUrl = `${url}${subsectionAnchor}`; + const subsectionSections = [sectionTitle, subsectionTitle]; + + subsectionRecords.push(createRecord(RECORD_TYPE.SECTION_TITLE, subsectionSections, null, subsectionUrl)); + subsectionRecords.push(...processParagraphs($(subsectionEl).find(".paragraph"), subsectionSections, subsectionUrl)); + + $(subsectionEl).find(".sect3").each((_i, subsubsectionEl) => { + const subsubsectionTitle = $(subsubsectionEl).find("h4").text().trim(); + const subsubsectionAnchor = $(subsubsectionEl).find("h4>a.anchor").attr("href"); + const subsubsectionUrl = `${url}${subsubsectionAnchor}`; + const subsubsectionSections = [sectionTitle, subsectionTitle, subsubsectionTitle]; + + subsectionRecords.push(createRecord(RECORD_TYPE.SECTION_TITLE, subsubsectionSections, null, subsubsectionUrl)); + subsectionRecords.push(...processParagraphs($(subsubsectionEl).find(".paragraph"), subsubsectionSections, subsubsectionUrl)); + }); + }); + return subsectionRecords; + }; const RECORD_TYPE = { TITLE: 0, CONTENT: 1, DESCRIPTION: 2, SECTION_TITLE: 3, - SUBSECTION_TITLE: 4, - OPENAPI: 5, + OPENAPI: 4, }; const isImdg = /\/imdg\//.test(url.pathname); const title = $(".doc h1").text().trim(); @@ -31,88 +77,24 @@ new Crawler({ const recordBase = { title: title, + sections: [], version: version, product: product, pageRank: isImdg ? "-1000" : "0", }; - // main title - records.push({ - ...recordBase, - type: RECORD_TYPE.TITLE, - url: url, - }); - - // description, aka #preamble - records.push({ - ...recordBase, - type: RECORD_TYPE.DESCRIPTION, - content: $("#preamble").text().trim(), - url: url, - }); + records.push(createRecord(RECORD_TYPE.TITLE, [], null, url)); + records.push(createRecord(RECORD_TYPE.DESCRIPTION, [], $("#preamble").text().trim(), url)); $(".sect1").each((_i, sectionEl) => { const sectionTitle = $(sectionEl).find("h2").text().trim(); const sectionAnchor = $(sectionEl).find("h2>a.anchor").attr("href"); - // section title - records.push({ - ...recordBase, - type: RECORD_TYPE.SECTION_TITLE, - section: sectionTitle, - url: `${url}${sectionAnchor}`, - }); - // add immediate paragraphs - $(sectionEl) - .find(".sectionbody>.paragraph") - .each((_i, contentEl) => { - const content = $(contentEl).text().trim(); - // filter out 'empty' elements like images - if (content) { - records.push({ - ...recordBase, - section: sectionTitle, - subsection: "subsection", // .sect2 - type: RECORD_TYPE.CONTENT, - content: content, - url: `${url}${sectionAnchor}`, - }); - } - }); - - // add paragraphs from subsections - $(sectionEl) - .find(".sect2") - .each((_i, subsectionEl) => { - const subsectionTitle = $(subsectionEl).find("h3").text().trim(); - const subsectionAnchor = $(subsectionEl) - .find("h3>a.anchor") - .attr("href"); - // subsection title - records.push({ - ...recordBase, - type: RECORD_TYPE.SUBSECTION_TITLE, - section: sectionTitle, - subsection: subsectionTitle, - url: `${url}${subsectionAnchor}`, - }); - // add immediate paragraphs - $(subsectionEl) - .find(".paragraph") - .each((_i, contentEl) => { - const content = $(contentEl).text().trim(); - // filter out 'empty' elements like images - if (content) { - records.push({ - ...recordBase, - section: sectionTitle, - subsection: subsectionTitle, - type: RECORD_TYPE.CONTENT, - content: content, - url: `${url}${subsectionAnchor}`, - }); - } - }); - }); + const sectionUrl = `${url}${sectionAnchor}`; + const sectionSections = [sectionTitle]; + + records.push(createRecord(RECORD_TYPE.SECTION_TITLE, sectionSections, null, sectionUrl)); + records.push(...processParagraphs($(sectionEl).find(".sectionbody>.paragraph"), sectionSections, sectionUrl)); + records.push(...processSubsections(sectionEl, sectionTitle)); }); return records; From 16fdc45a2111e74854018ed86d817655570a41d0 Mon Sep 17 00:00:00 2001 From: Mykola Fant Date: Thu, 3 Jul 2025 18:11:36 +0300 Subject: [PATCH 04/60] add crawler.js --- crawler.js | 121 +++++++++++++++++++----- src/partials/article.hbs | 6 +- src/partials/body-home.hbs | 2 +- src/partials/body-tutorials-landing.hbs | 2 +- src/partials/breadcrumbs.hbs | 2 +- 5 files changed, 101 insertions(+), 32 deletions(-) diff --git a/crawler.js b/crawler.js index 31ac95c5..3e22d732 100644 --- a/crawler.js +++ b/crawler.js @@ -11,15 +11,21 @@ new Crawler({ actions: [ { indexName: "algolia_crawler_articles_for_instant_search", - pathsToMatch: [ - "https://docs.hazelcast.com/hazelcast/5.5/clients/**", - ], + pathsToMatch: ["https://docs.hazelcast.com/hazelcast/5.5/clients/**"], recordExtractor: ({ url, $ }) => { + const getBreadcrumbs = () => { + return $(".breadcrumbs") + .find("li") + .map(function () { + return $(this).text().trim(); + }).toArray(); + }; const createRecord = (type, sections, content, recordUrl) => { const record = { ...recordBase, type: type, sections: sections, + breadcrumbs: getBreadcrumbs(), url: recordUrl, }; if (content) { @@ -33,7 +39,9 @@ new Crawler({ containerEl.each((_i, contentEl) => { const content = $(contentEl).text().trim(); if (content) { - paragraphRecords.push(createRecord(RECORD_TYPE.CONTENT, sections, content, baseUrl)); + paragraphRecords.push( + createRecord(RECORD_TYPE.CONTENT, sections, content, baseUrl), + ); } }); return paragraphRecords; @@ -41,25 +49,66 @@ new Crawler({ const processSubsections = (sectionEl, sectionTitle) => { const subsectionRecords = []; - $(sectionEl).find(".sect2").each((_i, subsectionEl) => { - const subsectionTitle = $(subsectionEl).find("h3").text().trim(); - const subsectionAnchor = $(subsectionEl).find("h3>a.anchor").attr("href"); - const subsectionUrl = `${url}${subsectionAnchor}`; - const subsectionSections = [sectionTitle, subsectionTitle]; - - subsectionRecords.push(createRecord(RECORD_TYPE.SECTION_TITLE, subsectionSections, null, subsectionUrl)); - subsectionRecords.push(...processParagraphs($(subsectionEl).find(".paragraph"), subsectionSections, subsectionUrl)); - - $(subsectionEl).find(".sect3").each((_i, subsubsectionEl) => { - const subsubsectionTitle = $(subsubsectionEl).find("h4").text().trim(); - const subsubsectionAnchor = $(subsubsectionEl).find("h4>a.anchor").attr("href"); - const subsubsectionUrl = `${url}${subsubsectionAnchor}`; - const subsubsectionSections = [sectionTitle, subsectionTitle, subsubsectionTitle]; - - subsectionRecords.push(createRecord(RECORD_TYPE.SECTION_TITLE, subsubsectionSections, null, subsubsectionUrl)); - subsectionRecords.push(...processParagraphs($(subsubsectionEl).find(".paragraph"), subsubsectionSections, subsubsectionUrl)); + $(sectionEl) + .find(".sect2") + .each((_i, subsectionEl) => { + const subsectionTitle = $(subsectionEl).find("h3").text().trim(); + const subsectionAnchor = $(subsectionEl) + .find("h3>a.anchor") + .attr("href"); + const subsectionUrl = `${url}${subsectionAnchor}`; + const subsectionSections = [sectionTitle, subsectionTitle]; + + subsectionRecords.push( + createRecord( + RECORD_TYPE.SECTION_TITLE, + subsectionSections, + null, + subsectionUrl, + ), + ); + subsectionRecords.push( + ...processParagraphs( + $(subsectionEl).find(".paragraph"), + subsectionSections, + subsectionUrl, + ), + ); + + $(subsectionEl) + .find(".sect3") + .each((_i, subsubsectionEl) => { + const subsubsectionTitle = $(subsubsectionEl) + .find("h4") + .text() + .trim(); + const subsubsectionAnchor = $(subsubsectionEl) + .find("h4>a.anchor") + .attr("href"); + const subsubsectionUrl = `${url}${subsubsectionAnchor}`; + const subsubsectionSections = [ + sectionTitle, + subsectionTitle, + subsubsectionTitle, + ]; + + subsectionRecords.push( + createRecord( + RECORD_TYPE.SECTION_TITLE, + subsubsectionSections, + null, + subsubsectionUrl, + ), + ); + subsectionRecords.push( + ...processParagraphs( + $(subsubsectionEl).find(".paragraph"), + subsubsectionSections, + subsubsectionUrl, + ), + ); + }); }); - }); return subsectionRecords; }; const RECORD_TYPE = { @@ -84,16 +133,36 @@ new Crawler({ }; records.push(createRecord(RECORD_TYPE.TITLE, [], null, url)); - records.push(createRecord(RECORD_TYPE.DESCRIPTION, [], $("#preamble").text().trim(), url)); + records.push( + createRecord( + RECORD_TYPE.DESCRIPTION, + [], + $("#preamble").text().trim(), + url, + ), + ); $(".sect1").each((_i, sectionEl) => { const sectionTitle = $(sectionEl).find("h2").text().trim(); const sectionAnchor = $(sectionEl).find("h2>a.anchor").attr("href"); const sectionUrl = `${url}${sectionAnchor}`; const sectionSections = [sectionTitle]; - - records.push(createRecord(RECORD_TYPE.SECTION_TITLE, sectionSections, null, sectionUrl)); - records.push(...processParagraphs($(sectionEl).find(".sectionbody>.paragraph"), sectionSections, sectionUrl)); + + records.push( + createRecord( + RECORD_TYPE.SECTION_TITLE, + sectionSections, + null, + sectionUrl, + ), + ); + records.push( + ...processParagraphs( + $(sectionEl).find(".sectionbody>.paragraph"), + sectionSections, + sectionUrl, + ), + ); records.push(...processSubsections(sectionEl, sectionTitle)); }); diff --git a/src/partials/article.hbs b/src/partials/article.hbs index c6e3493b..4b3e9f72 100644 --- a/src/partials/article.hbs +++ b/src/partials/article.hbs @@ -1,6 +1,6 @@

{{#if (eq page.layout '404')}} -

{{{or page.title 'Page Not Found'}}}

+

{{{or page.title 'Page Not Found'}}}

The page you’re looking for does not exist. It may have been moved.

@@ -31,7 +31,7 @@ {{#if (not-eq page.layout 'swagger')}} {{#with page.title}}

+
{{{page.contents}}}
{{#if page.attributes.include-tutorial-list}} diff --git a/src/partials/body-home.hbs b/src/partials/body-home.hbs index 228f8c75..1767bfa5 100644 --- a/src/partials/body-home.hbs +++ b/src/partials/body-home.hbs @@ -1,6 +1,6 @@
-

+

Hazelcast Documentation

diff --git a/src/partials/body-tutorials-landing.hbs b/src/partials/body-tutorials-landing.hbs index ef2b7b71..8e7aade0 100644 --- a/src/partials/body-tutorials-landing.hbs +++ b/src/partials/body-tutorials-landing.hbs @@ -1,7 +1,7 @@

Tutorials & Recipes diff --git a/src/partials/breadcrumbs.hbs b/src/partials/breadcrumbs.hbs index 6ab29213..9bbce5f4 100644 --- a/src/partials/breadcrumbs.hbs +++ b/src/partials/breadcrumbs.hbs @@ -1,4 +1,4 @@ -