From eb3e58da91b00536f8dada2d4e93c0ee41e84a57 Mon Sep 17 00:00:00 2001
From: Ian Wienand
Date: Wed, 1 Jul 2020 05:49:19 +1000
Subject: [PATCH] gitea-image: add a robots.txt

This looks like a very sane default robots.txt. We can modify it as required.

Change-Id: I8b9d3aa63538388e319f0216535f7a1d977f4885
---
 docker/gitea/custom/public/robots.txt | 35 +++++++++++++++++++++++++++
 testinfra/test_gitea.py               |  6 +++++
 2 files changed, 41 insertions(+)
 create mode 100644 docker/gitea/custom/public/robots.txt

diff --git a/docker/gitea/custom/public/robots.txt b/docker/gitea/custom/public/robots.txt
new file mode 100644
index 0000000000..0c7127359f
--- /dev/null
+++ b/docker/gitea/custom/public/robots.txt
@@ -0,0 +1,35 @@
+# This was kindly seeded with a mix of
+# https://git.lelux.fi/theel0ja/gitea-robots.txt/src/branch/master/robots.txt
+# and
+# https://github.com/robots.txt
+# at 2020-07-01
+#
+# Some commented out items are left to indicate we have considered
+# them and would like to explicitly allow them for indexing while they
+# are not causing problems.
+
+User-agent: *
+
+# Disallow: /avatars
+# Disallow: /user/*
+# Disallow: /*/*/src/commit/*
+# Disallow: /*/*/commit/*
+
+Disallow: /*/*/activity/*
+Disallow: /vendor/librejs.html
+Disallow: /api/swagger
+Disallow: /swagger.*.json
+
+# Language spam
+Disallow: /*?lang=
+
+# From github
+Disallow: */archive/
+Disallow: */blame/
+# Disallow: /raw/*
+Disallow: /.git/
+Disallow: */.git/
+Disallow: /*.git$
+Disallow: /*q=
+
+Crawl-delay: 2
diff --git a/testinfra/test_gitea.py b/testinfra/test_gitea.py
index 2a1e2c8b62..6990b02c6d 100644
--- a/testinfra/test_gitea.py
+++ b/testinfra/test_gitea.py
@@ -33,3 +33,9 @@ def test_ulimit(host):
 def test_sshd_logs(host):
     cmd = host.run("docker logs gitea-docker_gitea-ssh_1")
     assert cmd.stdout != '' or cmd.stderr != ''
+
+def test_robots(host):
+    cmd = host.run('curl --insecure '
+                   '--resolve gitea99.opendev.org:3000:127.0.0.1 '
+                   'https://gitea99.opendev.org:3000/robots.txt')
+    assert 'Disallow: /' in cmd.stdout