
Commit 31c4d2e

Copilot and kevin-lyn authored
Add robots.txt to optimize AI crawler indexing for MLflow documentation (#386)
Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: kevin-lyn <[email protected]>
1 parent 920fa2a commit 31c4d2e

File tree

2 files changed: +97 -0 lines changed


website/static/robots.txt

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# Robots.txt for MLflow Documentation
+# Optimized for AI crawlers to prioritize latest documentation
+
+# Default rules for all crawlers
+User-agent: *
+# Allow latest documentation
+Allow: /docs/latest/
+# Disallow all legacy documentation versions
+Disallow: /docs/1.*/
+Disallow: /docs/2.*/
+Disallow: /docs/0.*/
+
+# Specific rules for AI crawlers
+# OpenAI (ChatGPT)
+User-agent: ChatGPT-User
+User-agent: GPTBot
+Allow: /docs/latest/
+Disallow: /docs/1.*/
+Disallow: /docs/2.*/
+Disallow: /docs/0.*/
+
+# Google Gemini
+User-agent: Google-Extended
+Allow: /docs/latest/
+Disallow: /docs/1.*/
+Disallow: /docs/2.*/
+Disallow: /docs/0.*/
+
+# Anthropic Claude
+User-agent: ClaudeBot
+User-agent: Claude-Web
+Allow: /docs/latest/
+Disallow: /docs/1.*/
+Disallow: /docs/2.*/
+Disallow: /docs/0.*/
+
+# Common Crawl (used by many AI systems)
+User-agent: CCBot
+Allow: /docs/latest/
+Disallow: /docs/1.*/
+Disallow: /docs/2.*/
+Disallow: /docs/0.*/
+
+# Perplexity
+User-agent: PerplexityBot
+Allow: /docs/latest/
+Disallow: /docs/1.*/
+Disallow: /docs/2.*/
+Disallow: /docs/0.*/
+
+# Cohere
+User-agent: cohere-ai
+Allow: /docs/latest/
+Disallow: /docs/1.*/
+Disallow: /docs/2.*/
+Disallow: /docs/0.*/
+
+# Sitemap location
+Sitemap: http://mlflow.org/sitemap.xml
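
A note on the pattern syntax: the "*" in rules such as "Disallow: /docs/2.*/" is the wildcard from Google-style robots.txt matching (standardized in RFC 9309), where "*" matches any character sequence, a trailing "$" anchors the end of the path, and the longest matching rule wins, with Allow preferred on ties. The crawlers targeted above support this syntax. A minimal TypeScript sketch of that matching logic follows; the helper names are hypothetical and this is not any library's API, just an illustration of how the rules above would be evaluated:

// Sketch of RFC 9309-style rule matching; helper names are hypothetical.
type Rule = { allow: boolean; pattern: string };

// Convert a robots.txt path pattern into an anchored RegExp:
// '*' matches any sequence, a trailing '$' anchors the end of the path.
function patternToRegExp(pattern: string): RegExp {
  const endAnchored = pattern.endsWith("$");
  const body = endAnchored ? pattern.slice(0, -1) : pattern;
  const escaped = body.replace(/[.+?^${}()|[\]\\]/g, "\\$&"); // escape regex specials
  const wildcards = escaped.replace(/\*/g, ".*"); // robots '*' -> regex '.*'
  return new RegExp("^" + wildcards + (endAnchored ? "$" : ""));
}

// Longest matching rule wins; on a tie, Allow beats Disallow.
// A path that matches no rule at all is allowed by default.
function isAllowed(rules: Rule[], path: string): boolean {
  let best: Rule | null = null;
  for (const rule of rules) {
    if (!patternToRegExp(rule.pattern).test(path)) continue;
    if (
      best === null ||
      rule.pattern.length > best.pattern.length ||
      (rule.pattern.length === best.pattern.length && rule.allow && !best.allow)
    ) {
      best = rule;
    }
  }
  return best === null ? true : best.allow;
}

// The GPTBot group from the file above:
const gptBotRules: Rule[] = [
  { allow: true, pattern: "/docs/latest/" },
  { allow: false, pattern: "/docs/1.*/" },
  { allow: false, pattern: "/docs/2.*/" },
  { allow: false, pattern: "/docs/0.*/" },
];

console.log(isAllowed(gptBotRules, "/docs/latest/tracking/")); // true
console.log(isAllowed(gptBotRules, "/docs/2.9.2/tracking/")); // false

Under these rules, a crawler honoring the file indexes only /docs/latest/, which is exactly what the tests below assert.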

website/tests/robots.spec.ts

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+import { test, expect } from "@playwright/test";
+
+test.describe("Robots.txt", () => {
+  test("robots.txt is accessible", async ({ page }) => {
+    const response = await page.goto("/robots.txt");
+    expect(response?.status()).toBe(200);
+  });
+
+  test("robots.txt allows latest docs", async ({ page }) => {
+    const response = await page.goto("/robots.txt");
+    const content = await response?.text();
+    expect(content).toContain("Allow: /docs/latest/");
+  });
+
+  test("robots.txt disallows legacy versions", async ({ page }) => {
+    const response = await page.goto("/robots.txt");
+    const content = await response?.text();
+    expect(content).toContain("Disallow: /docs/1.*/");
+    expect(content).toContain("Disallow: /docs/2.*/");
+    expect(content).toContain("Disallow: /docs/0.*/");
+  });
+
+  test("robots.txt includes AI crawler configurations", async ({ page }) => {
+    const response = await page.goto("/robots.txt");
+    const content = await response?.text();
+    // Check for various AI crawlers
+    expect(content).toContain("GPTBot");
+    expect(content).toContain("ClaudeBot");
+    expect(content).toContain("Google-Extended");
+    expect(content).toContain("CCBot");
+  });
+
+  test("robots.txt includes sitemap", async ({ page }) => {
+    const response = await page.goto("/robots.txt");
+    const content = await response?.text();
+    expect(content).toContain("Sitemap:");
+  });
+});
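
One detail worth noting when running this spec: the tests navigate to the relative URL "/robots.txt", which Playwright only resolves when a baseURL is configured. The repository's actual Playwright config is not part of this commit; a minimal hypothetical playwright.config.ts, with an assumed local dev-server URL, might look like:

import { defineConfig } from "@playwright/test";

export default defineConfig({
  testDir: "./tests",
  use: {
    // Relative page.goto() calls such as "/robots.txt" resolve against this.
    // The URL is an assumption for illustration, not taken from the repo.
    baseURL: "http://localhost:3000",
  },
});

With a config like that in place, npx playwright test tests/robots.spec.ts runs the five assertions above against the served robots.txt.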

0 commit comments
