Skip to content

Commit

Permalink
Disallow crawlers from high cardinality collection pages (#4735)
Browse files Browse the repository at this point in the history
* Add collection routes to robots.txt

* Disallow crawlers from following links on pages with results

* Remove unused and confusing development deployment env

* Only send index meta if site is configured as indexable

* Update frontend/src/middleware/collection.ts

Co-authored-by: zack <[email protected]>

* Comment on rationale of robots rules

* Do not set robots rules if the route has an error

* Only call useRobotsRule inside page components

---------

Co-authored-by: zack <[email protected]>
  • Loading branch information
sarayourfriend and zackkrida authored Aug 12, 2024
1 parent acadc1d commit fb7f8c5
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ repos:
- id: check-xml
- id: check-yaml
- id: end-of-file-fixer
exclude: frontend/test/tapes/.+\.json5
exclude: (frontend/test/tapes/.+\.json5|frontend/.*snapshots.*)
- id: check-symlinks
- id: mixed-line-ending
- id: fix-encoding-pragma
Expand Down
10 changes: 7 additions & 3 deletions frontend/nuxt.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,18 @@ export default defineNuxtConfig({
},
/**
* Robots.txt rules are configured here via the \@nuxtjs/robots package.
* @see {@link https://nuxtseo.com/robots/guides/nuxt-config|Robots Config Rules}
* @see {@link https://nuxtseo.com/robots/guides/nuxt-config}
*/
robots: {
disallow: ["/search", "/search/audio", "/search/image"],
disallow: [
// robots rules are prefix-based, so there's no need to configure specific media type searches
"/search",
// Other routes have more complex requirements; we configure those with `useRobotsRule` as needed
],
groups: [
...disallowedBots.map((bot) => ({
userAgent: [bot],
disallow: ["/"], // block bots from all routes
disallow: ["/"], // block disallowed bots from all routes
})),
],
},
Expand Down
30 changes: 30 additions & 0 deletions frontend/src/composables/use-page-robots-rule.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { useRobotsRule, useSiteConfig } from "#imports"

/**
 * Robots meta tag and header instructions for pages.
 *
 * These are distinct from robots.txt rules because we do not
 * want to prevent bots from viewing the pages altogether
 * in case they are visiting for e.g., embed information.
 * We _do_ want to disallow following links that will cause
 * rapid and unwanted crawling behaviour (e.g., related
 * results on a single result page, collection results, etc)
 *
 * Pages not listed here are either covered by the robots.txt
 * rules configured in nuxt.config.ts or are allowed to be
 * crawled with default settings (index and follow links)
 */
const pageRobots = {
  "single-result": "noindex, nofollow",
  "tag-collection": "noindex, nofollow",
  "source-collection": "index, nofollow",
  "creator-collection": "noindex, nofollow",
} as const

export const usePageRobotsRule = (page: keyof typeof pageRobots) => {
  // When the site is not configured as indexable (e.g., staging),
  // always send the most restrictive rule regardless of page type.
  const { indexable } = useSiteConfig()
  useRobotsRule(indexable ? pageRobots[page] : "noindex, nofollow")
}
3 changes: 1 addition & 2 deletions frontend/src/constants/deploy-env.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
export const LOCAL = "local"
export const DEVELOPMENT = "development"
export const STAGING = "staging"
export const PRODUCTION = "production"

// The order of the environments is important. They should be arranged in
// increasing order of code-readiness, from local to production.
export const DEPLOY_ENVS = [LOCAL, DEVELOPMENT, STAGING, PRODUCTION] as const
export const DEPLOY_ENVS = [LOCAL, STAGING, PRODUCTION] as const

export type DeployEnv = (typeof DEPLOY_ENVS)[number]
3 changes: 3 additions & 0 deletions frontend/src/middleware/single-result.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,17 @@ export default defineNuxtRouteMiddleware(async (to, from) => {
if (!mediaId) {
return
}

singleResultStore.setMediaById(mediaType, mediaId)

if (import.meta.server) {
await Promise.allSettled([
singleResultStore.fetch(mediaType, mediaId),
relatedMediaStore.fetchMedia(mediaType, mediaId),
])

const fetchingError = singleResultStore.fetchState.fetchingError

if (
!singleResultStore.mediaItem &&
fetchingError &&
Expand Down
4 changes: 4 additions & 0 deletions frontend/src/pages/audio/[id]/index.vue
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import { useSensitiveMedia } from "~/composables/use-sensitive-media"
import { useSingleResultStore } from "~/stores/media/single-result"
import singleResultMiddleware from "~/middleware/single-result"
import { usePageRobotsRule } from "~/composables/use-page-robots-rule"
import VAudioTrack from "~/components/VAudioTrack/VAudioTrack.vue"
import VMediaReuse from "~/components/VMediaInfo/VMediaReuse.vue"
import VRelatedMedia from "~/components/VMediaInfo/VRelatedMedia.vue"
Expand All @@ -42,6 +44,8 @@ definePageMeta({
middleware: singleResultMiddleware,
})
usePageRobotsRule("single-result")
const singleResultStore = useSingleResultStore()
const route = useRoute()
Expand Down
10 changes: 10 additions & 0 deletions frontend/src/pages/audio/collection.vue
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@ import { collectionMiddleware } from "~/middleware/collection"
import { skipToContentTargetId } from "~/constants/window"
import { useCollection } from "~/composables/use-collection"
import { usePageRobotsRule } from "~/composables/use-page-robots-rule"
import { AUDIO } from "~/constants/media"
import { CollectionParams } from "~/types/search"
import VCollectionResults from "~/components/VSearchResultsGrid/VCollectionResults.vue"
defineOptions({
Expand All @@ -30,10 +33,17 @@ const {
pageTitle,
} = useCollection({ mediaType: AUDIO })
// Collection params are not nullable in the collections route; this is enforced by the middleware.
// Question: should this non-nullability be reflected in the type and enforced at runtime by `useCollection`?
usePageRobotsRule(
`${(collectionParams.value as NonNullable<CollectionParams>).collection}-collection`
)
useHead({
meta: [{ hid: "og:title", property: "og:title", content: pageTitle.value }],
title: pageTitle.value,
})
/**
* Media is not empty when we navigate back to this page, so we don't need to fetch
* it again to make sure that all the previously fetched media is displayed.
Expand Down
4 changes: 4 additions & 0 deletions frontend/src/pages/image/[id]/index.vue
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import { useSingleResultPageMeta } from "~/composables/use-single-result-page-me
import { useSingleResultStore } from "~/stores/media/single-result"
import singleResultMiddleware from "~/middleware/single-result"
import { usePageRobotsRule } from "~/composables/use-page-robots-rule"
import VBone from "~/components/VSkeleton/VBone.vue"
import VMediaReuse from "~/components/VMediaInfo/VMediaReuse.vue"
import VRelatedMedia from "~/components/VMediaInfo/VRelatedMedia.vue"
Expand All @@ -47,6 +49,8 @@ definePageMeta({
middleware: singleResultMiddleware,
})
usePageRobotsRule("single-result")
const singleResultStore = useSingleResultStore()
const nuxtApp = useNuxtApp()
Expand Down
9 changes: 9 additions & 0 deletions frontend/src/pages/image/collection.vue
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import { skipToContentTargetId } from "~/constants/window"
import { useCollection } from "~/composables/use-collection"
import { IMAGE } from "~/constants/media"
import { usePageRobotsRule } from "~/composables/use-page-robots-rule"
import { CollectionParams } from "~/types/search"
import VCollectionResults from "~/components/VSearchResultsGrid/VCollectionResults.vue"
defineOptions({
Expand Down Expand Up @@ -34,6 +37,12 @@ useHead(() => ({
title: pageTitle.value,
}))
// Collection params are not nullable in the collections route; this is enforced by the middleware.
// Question: should this non-nullability be reflected in the type and enforced at runtime by `useCollection`?
usePageRobotsRule(
`${(collectionParams.value as NonNullable<CollectionParams>).collection}-collection`
)
/**
* Media is not empty when we navigate back to this page, so we don't need
* to fetch it again to make sure that all the previously fetched media is displayed.
Expand Down
20 changes: 15 additions & 5 deletions frontend/test/playwright/e2e/seo.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ const DESCRIPTION =
const NO_INDEX = "noindex, nofollow"
const INDEX =
"index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1"
const INDEX_NO_FOLLOW = "index, nofollow"
const DEFAULT_IMAGE = "/openverse-default.jpg"

const pages = {
Expand Down Expand Up @@ -46,7 +47,7 @@ const pages = {
"/v1/images/da5cb478-c093-4d62-b721-cda18797e3fb/thumb/"
),
ogTitle: "bird",
robots: INDEX,
robots: NO_INDEX,
},
audioDetail: {
url: "/audio/7e063ee6-343f-48e4-a4a5-f436393730f6",
Expand All @@ -55,7 +56,7 @@ const pages = {
"/v1/audio/7e063ee6-343f-48e4-a4a5-f436393730f6/thumb/"
),
ogTitle: "I Love My Dog You Love your Cat",
robots: INDEX,
robots: NO_INDEX,
},
about: {
url: "/about",
Expand All @@ -69,21 +70,21 @@ const pages = {
title: "cat images | Openverse",
ogImage: DEFAULT_IMAGE,
ogTitle: "cat images | Openverse",
robots: INDEX,
robots: NO_INDEX,
},
source: {
url: "/image/collection?source=flickr",
title: "Flickr images | Openverse",
ogImage: DEFAULT_IMAGE,
ogTitle: "Flickr images | Openverse",
robots: INDEX,
robots: INDEX_NO_FOLLOW,
},
creator: {
url: "/image/collection?source=flickr&creator=strogoscope",
title: "strogoscope | Openverse",
ogImage: DEFAULT_IMAGE,
ogTitle: "strogoscope | Openverse",
robots: INDEX,
robots: NO_INDEX,
},
}
test.describe("page metadata", () => {
Expand Down Expand Up @@ -112,3 +113,12 @@ test.describe("page metadata", () => {
})
}
})

test.describe("robots.txt", () => {
test("snapshot", async ({ page }) => {
await page.goto("/robots.txt")
const robotsText = await page.innerText("body")

expect(robotsText).toMatchSnapshot({ name: "robots.txt" })
})
})
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# START nuxt-robots (indexable)
User-agent: *
Disallow: /search
Disallow: /ar/search
Disallow: /es/search
Disallow: /ru/search

User-agent: GPTBot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: CCBot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: ChatGPT-User
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Google-Extended
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: anthropic-ai
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Omgilibot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Omgili
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: FacebookBot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Diffbot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: Bytespider
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: ImagesiftBot
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

User-agent: cohere-ai
Disallow: /
Disallow: /ar/
Disallow: /es/
Disallow: /ru/

Sitemap: http://localhost:8443/sitemap_index.xml
# END nuxt-robots
5 changes: 3 additions & 2 deletions frontend/test/playwright/playwright.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@ const config: PlaywrightTestConfig = {
port: 8443,
reuseExistingServer: !process.env.CI || process.env.PWDEBUG === "1",
env: {
UPDATE_TAPES,
UPDATE_TAPES: UPDATE_TAPES,
NUXT_PUBLIC_API_URL: API_URL,
// Must be true for seo tests to receive appropriate values
NUXT_PUBLIC_SITE_INDEXABLE: "true",
NUXT_PUBLIC_DEPLOYMENT_ENV: STAGING,
NUXT_PUBLIC_PLAUSIBLE_DOMAIN: "localhost",
NUXT_PUBLIC_PLAUSIBLE_API_HOST: "http://localhost:50290",
NUXT_PUBLIC_PLAUSIBLE_AUTO_PAGEVIEWS: "false",
NUXT_PUBLIC_PLAUSIBLE_IGNORED_HOSTNAMES: "[]",
NUXT_PUBLIC_SITE_INDEXABLE: "true",
},
},
use: {
Expand Down

0 comments on commit fb7f8c5

Please sign in to comment.