-
Notifications
You must be signed in to change notification settings - Fork 202
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Disallow crawlers from high cardinality collection pages (#4735)
* Add collection routes to robots.txt * Disallow crawlers from following links on pages with results * Remove unused and confusing development deployment env * Only send index meta if site is configured as indexable * Update frontend/src/middleware/collection.ts Co-authored-by: zack <[email protected]> * Comment on rationale of robots rules * Do not set robots rules if the route has an error * Only call useRobotsRule inside page components --------- Co-authored-by: zack <[email protected]>
- Loading branch information
1 parent
acadc1d
commit fb7f8c5
Showing
12 changed files
with
168 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import { useRobotsRule, useSiteConfig } from "#imports" | ||
|
||
/** | ||
* Robots meta tag and header instructions for pages | ||
* These are distinct from robots.txt rules because do not | ||
* want to prevent bots from viewing the pages altogether | ||
* in case they are visiting for e.g., embed information. | ||
* We _do_ want to disallow following links that will cause | ||
* rapid and unwanted crawling behaviour (e.g., related | ||
* results on a single result page, collection results, etc) | ||
* | ||
* Pages not listed here are either covered by the robots.txt | ||
* rules configured in nuxt.config.ts or are allowed to be | ||
* crawled with default settings (index and follow links) | ||
*/ | ||
const pageRobots = { | ||
"single-result": "noindex, nofollow", | ||
"tag-collection": "noindex, nofollow", | ||
"source-collection": "index, nofollow", | ||
"creator-collection": "noindex, nofollow", | ||
} as const | ||
|
||
export const usePageRobotsRule = (page: keyof typeof pageRobots) => { | ||
const siteConfig = useSiteConfig() | ||
if (!siteConfig.indexable) { | ||
useRobotsRule("noindex, nofollow") | ||
} else { | ||
useRobotsRule(pageRobots[page]) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,9 @@ | ||
export const LOCAL = "local" | ||
export const DEVELOPMENT = "development" | ||
export const STAGING = "staging" | ||
export const PRODUCTION = "production" | ||
|
||
// The order of the environments is important. They should be arranged in | ||
// increasing order of code-readiness, from local to production. | ||
export const DEPLOY_ENVS = [LOCAL, DEVELOPMENT, STAGING, PRODUCTION] as const | ||
export const DEPLOY_ENVS = [LOCAL, STAGING, PRODUCTION] as const | ||
|
||
export type DeployEnv = (typeof DEPLOY_ENVS)[number] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
81 changes: 81 additions & 0 deletions
81
frontend/test/playwright/e2e/seo.spec.ts-snapshots/robots-linux.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# START nuxt-robots (indexable)
User-agent: *
Disallow: /search
Disallow: /ar/search
Disallow: /es/search
Disallow: /ru/search

User-agent: GPTBot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: CCBot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: ChatGPT-User
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Google-Extended
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: anthropic-ai
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Omgilibot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Omgili
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: FacebookBot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Diffbot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Bytespider
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: ImagesiftBot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: cohere-ai
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

Sitemap: http://localhost:8443/sitemap_index.xml
# END nuxt-robots
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters