diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 6199f77..3bfa57f 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -18,6 +18,7 @@ jobs: AWS_REGION: ${{ secrets.AWS_REGION }} GITHUB_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} LAUNCH_DARKLY_KEY: ${{ secrets.LAUNCH_DARKLY_KEY_DEV }} + DB_HOST: 127.0.0.1 # Will not work with 'localhost', since that will try a Unix socket connection (!) services: elasticsearch7: image: docker.elastic.co/elasticsearch/elasticsearch:7.10.0 @@ -30,6 +31,16 @@ jobs: http.cors.allow-origin: "*" ports: - 9200:9200 + db: + image: mysql:8.0 + env: + MYSQL_DATABASE: "rorapi" + MYSQL_USER: "ror_user" + MYSQL_PASSWORD: "password" + MYSQL_ROOT_PASSWORD: "password" + ports: + - 3306:3306 + options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 steps: - name: Checkout ror-api code uses: actions/checkout@v2 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bfd0346..fede267 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -9,6 +9,7 @@ jobs: ELASTIC_PASSWORD: "changeme" ELASTIC7_HOST: "localhost" ELASTIC7_PORT: "9200" + DB_HOST: 127.0.0.1 # Must be the IP: 'localhost' would try a Unix socket connection AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.AWS_REGION }} @@ -26,6 +27,16 @@ jobs: http.cors.allow-origin: "*" ports: - 9200:9200 + db: + image: mysql:8.0 + env: + MYSQL_DATABASE: "rorapi" + MYSQL_USER: "ror_user" + MYSQL_PASSWORD: "password" + MYSQL_ROOT_PASSWORD: "password" + ports: + - 3306:3306 + options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 steps: - name: Checkout ror-api code uses: actions/checkout@v2 diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 57bef2f..42907f1 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -10,6 +10,7 @@ jobs: 
ELASTIC_PASSWORD: "changeme" ELASTIC7_HOST: "localhost" ELASTIC7_PORT: "9200" + DB_HOST: 127.0.0.1 # Must be the IP: 'localhost' would try a Unix socket connection AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.AWS_REGION }} @@ -27,6 +28,16 @@ jobs: http.cors.allow-origin: "*" ports: - 9200:9200 + db: + image: mysql:8.0 + env: + MYSQL_DATABASE: "rorapi" + MYSQL_USER: "ror_user" + MYSQL_PASSWORD: "password" + MYSQL_ROOT_PASSWORD: "password" + ports: + - 3306:3306 + options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 steps: - name: Checkout ror-api code uses: actions/checkout@v2 diff --git a/Dockerfile b/Dockerfile index 9ebc84e..09b18d6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,7 @@ RUN mv /etc/apt/sources.list.d /etc/apt/sources.list.d.bak && \ mv /etc/apt/sources.list.d.bak /etc/apt/sources.list.d && \ apt-get upgrade -y -o Dpkg::Options::="--force-confold" && \ apt-get clean && \ - apt-get install ntp wget unzip tzdata python3-pip libmagic1 -y && \ + apt-get install ntp wget unzip tzdata python3-pip libmagic1 default-libmysqlclient-dev libcairo2-dev pkg-config -y && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # Enable Passenger and Nginx and remove the default site @@ -54,6 +54,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt RUN pip3 install yapf # collect static files for Django +ENV DJANGO_SKIP_DB_CHECK=True RUN python manage.py collectstatic --noinput # Expose web diff --git a/README.md b/README.md index 953426d..dd58935 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,9 @@ Commands for indexing ROR data, generating new ROR IDs and other internal operat ROUTE_USER=[USER] TOKEN=[TOKEN] -Replace values in [] with valid credential values. GITHUB_TOKEN is needed in order to index an existing data dump locally. ROUTE_USER and TOKEN are only needed in order to use generate-id functionality locally. 
AWS_* and DATA_STORE are only needed in order to use incremental indexing from S3 functionality locally. +ROR staff should replace values in [] with valid credential values. External users do not need to add these values but should comment out this line https://github.com/ror-community/ror-api/blob/8a5a5ae8b483564c966a7184349c581dcae756ef/rorapi/management/commands/setup.py#L13 so that there is no attempt to send a Github token when retrieving a data dump for indexing. + +- Optionally, uncomment [line 24 in docker-compose.yml](https://github.com/ror-community/ror-api/blob/master/docker-compose.yml#L24) in order to pull the rorapi image from Dockerhub rather than creating it from local code ## Start ror-api locally 1. Start Docker Desktop diff --git a/docker-compose.yml b/docker-compose.yml index 0cd1b0b..253e3c3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,6 +19,18 @@ services: timeout: 1s volumes: - ./esdata:/usr/share/elasticsearch/data + db: + image: mysql:8.0 + volumes: + - mysql_data:/var/lib/mysql + env_file: + - .env + ports: + - "3306:3306" + healthcheck: + test: ["CMD", "mysqladmin", "ping", "-h", "localhost"] + timeout: 20s + retries: 10 web: container_name: rorapiweb env_file: .env @@ -31,3 +43,6 @@ services: - ./rorapi:/home/app/webapp/rorapi depends_on: - elasticsearch7 + - db # NOTE(review): short-form depends_on only orders startup; it does not wait for the db healthcheck - confirm web tolerates a not-yet-ready MySQL +volumes: + mysql_data: diff --git a/requirements.txt b/requirements.txt index f28d73b..a44a9b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,8 @@ update_address @ git+https://github.com/ror-community/update_address.git launchdarkly-server-sdk==7.6.1 jsonschema==3.2.0 python-magic -iso639-lang \ No newline at end of file +iso639-lang +mysqlclient==2.2.7 +bleach==6.0.0 +pycountry==22.3.5 +django-ses==3.5.0 \ No newline at end of file diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py index b9aa57c..277e0bd 100644 --- a/rorapi/common/urls.py +++ b/rorapi/common/urls.py @@ -3,7 +3,7 @@ from rest_framework.documentation import 
include_docs_urls from . import views from rorapi.common.views import ( - HeartbeatView,GenerateAddress,GenerateId,IndexData,IndexDataDump,BulkUpdate) + HeartbeatView,GenerateAddress,GenerateId,IndexData,IndexDataDump,BulkUpdate,ClientRegistrationView,ValidateClientView) urlpatterns = [ # Health check @@ -14,6 +14,8 @@ path('generateaddress/', GenerateAddress.as_view()), url(r"^generateid$", GenerateId.as_view()), re_path(r"^(?P(v1|v2))\/bulkupdate$", BulkUpdate.as_view()), + re_path(r"^(?P<version>(v1|v2))\/register$", ClientRegistrationView.as_view()), # 'version' group feeds post(self, request, version='v2') + path('validate-client-id/<str:client_id>/', ValidateClientView.as_view()), url(r"^(?P(v1|v2))\/indexdata/(?P.*)", IndexData.as_view()), url(r"^(?P(v1|v2))\/indexdatadump\/(?Pv(\d+\.)?(\d+\.)?(\*|\d+)-\d{4}-\d{2}-\d{2}-ror-data)\/(?P(test|prod))$", IndexDataDump.as_view()), url(r"^(?P(v1|v2))\/", include(views.organizations_router.urls)), diff --git a/rorapi/common/views.py b/rorapi/common/views.py index ebcfa21..554e22b 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -37,10 +37,87 @@ import os import update_address as ua from rorapi.management.commands.generaterorid import check_ror_id -from rorapi.management.commands.generaterorid import check_ror_id from rorapi.management.commands.indexror import process_files from django.core import management import rorapi.management.commands.indexrordump +from django.core.mail import EmailMultiAlternatives +from django.utils.timezone import now +from rorapi.v2.models import Client +from rorapi.v2.serializers import ClientSerializer + +class ClientRegistrationView(APIView): + def post(self, request, version='v2'): + serializer = ClientSerializer(data=request.data) + if serializer.is_valid(): + client = serializer.save() + + subject = 'ROR API client ID' + from_email = "ROR API Support " + recipient_list = [client.email] + + html_content = self._get_html_content(client.client_id) + text_content = self._get_text_content(client.client_id) + + msg = EmailMultiAlternatives(subject, 
text_content, from_email, recipient_list) + msg.attach_alternative(html_content, "text/html") + msg.send() # NOTE(review): a send() failure propagates here after the client row is already saved + + return Response({'client_id': client.client_id}, status=status.HTTP_201_CREATED) + + return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST) + + def _get_text_content(self, client_id): + return f""" + Thank you for registering for a ROR API client ID! + + Your ROR API client ID is: + {client_id} + + This client ID is not used for authentication or authorization, and is therefore not secret and can be sent as plain text. + + In order to receive a rate limit of 2000 requests per 5 minute period, please include this client ID with your ROR API requests, in a custom HTTP header named Client-Id, for example: + + curl -H "Client-Id: {client_id}" https://api.ror.org/organizations?query=oxford + + Requests without a valid client ID are subject to a rate limit of 50 requests per 5 minute period. + + We do not provide a way to recover or revoke a lost client ID. If you lose track of your client ID, please register a new client ID. For more information about ROR API client IDs, see https://ror.readme.io/docs/client-id + + If you have questions, please see ROR documentation or contact us at support@ror.org + + Cheers, + The ROR Team + support@ror.org + https://ror.org + """ + + + def _get_html_content(self, client_id): + return f""" +
+

Thank you for registering for a ROR API client ID!

+

Your ROR API client ID is:

+
{client_id}
+

This client ID is not used for authentication or authorization, and is therefore not secret and can be sent as plain text.

+

In order to receive a rate limit of 2000 requests per 5 minute period, please include this client ID with your ROR API requests, in a custom HTTP header named Client-Id, for example:

+
curl -H "Client-Id: {client_id}" https://api.ror.org/organizations?query=oxford
+

Requests without a valid client ID are subject to a rate limit of 50 requests per 5 minute period.

+

We do not provide a way to recover or revoke a lost client ID. If you lose track of your client ID, please register a new one.

+

For more information about ROR API client IDs, see our documentation.

+

If you have questions, please see the ROR documentation or contact us at support@ror.org.

+

Cheers,
+ The ROR Team
+ support@ror.org
+ https://ror.org

+
+ """ + + +class ValidateClientView(APIView): + def get(self, request, client_id): + client_exists = Client.objects.filter(client_id=client_id).exists() + + return Response({'valid': client_exists}, status=status.HTTP_200_OK) class OurTokenPermission(BasePermission): """ diff --git a/rorapi/management/commands/generaterorid.py b/rorapi/management/commands/generaterorid.py index 2ea42a5..af07be0 100644 --- a/rorapi/management/commands/generaterorid.py +++ b/rorapi/management/commands/generaterorid.py @@ -26,3 +26,10 @@ def check_ror_id(version): check_ror_id(version) return ror_id + +def generate_ror_client_id(): + """Generates a random ROR client ID: 160 random bits, Crockford base32, lowercased, zero-padded to 32 characters. + """ + + n = random.SystemRandom().randint(0, 2**160 - 1) # OS entropy source instead of the predictable default PRNG + return base32_crockford.encode(n).lower().zfill(32) diff --git a/rorapi/migrations/0001_create_client_model.py b/rorapi/migrations/0001_create_client_model.py new file mode 100644 index 0000000..b7b558f --- /dev/null +++ b/rorapi/migrations/0001_create_client_model.py @@ -0,0 +1,30 @@ +# Generated by Django 2.2.28 on 2025-03-11 07:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='Client', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('email', models.EmailField(max_length=255)), + ('name', models.CharField(blank=True, max_length=255)), + ('institution_name', models.CharField(blank=True, max_length=255)), + ('institution_ror', models.URLField(blank=True, max_length=255)), + ('country_code', models.CharField(blank=True, max_length=2)), + ('ror_use', models.TextField(blank=True, max_length=500)), + ('client_id', models.CharField(editable=False, max_length=32, unique=True)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('last_request_at', models.DateTimeField(blank=True, null=True)), + ('request_count', models.IntegerField(default=0)), + ], + ), + ] diff --git 
a/rorapi/migrations/0002_auto_20250326_1054.py b/rorapi/migrations/0002_auto_20250326_1054.py new file mode 100644 index 0000000..0d06ed9 --- /dev/null +++ b/rorapi/migrations/0002_auto_20250326_1054.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.28 on 2025-03-26 10:54 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('rorapi', '0001_create_client_model'), + ] + + operations = [ + migrations.AlterField( + model_name='client', + name='email', + field=models.EmailField(max_length=255, unique=True), # adds a uniqueness constraint on email + ), + ] diff --git a/rorapi/migrations/0003_auto_20250415_1207.py b/rorapi/migrations/0003_auto_20250415_1207.py new file mode 100644 index 0000000..0697d02 --- /dev/null +++ b/rorapi/migrations/0003_auto_20250415_1207.py @@ -0,0 +1,43 @@ +# Generated by Django 2.2.28 on 2025-04-15 12:07 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('rorapi', '0002_auto_20250326_1054'), + ] + + operations = [ + migrations.AlterField( + model_name='client', + name='country_code', + field=models.CharField(blank=True, max_length=2, null=True), + ), + migrations.AlterField( + model_name='client', + name='email', + field=models.EmailField(max_length=255), # no unique=True: removes the constraint added in migration 0002 + ), + migrations.AlterField( + model_name='client', + name='institution_name', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='client', + name='institution_ror', + field=models.URLField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='client', + name='name', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='client', + name='ror_use', + field=models.TextField(blank=True, max_length=500, null=True), + ), + ] diff --git a/rorapi/migrations/__init__.py b/rorapi/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rorapi/settings.py 
b/rorapi/settings.py index a8f3504..e1a7f8e 100644 --- a/rorapi/settings.py +++ b/rorapi/settings.py @@ -11,6 +11,7 @@ """ import os +import sys import json import sentry_sdk import boto3 @@ -71,7 +72,7 @@ 'django.contrib.auth.middleware.AuthenticationMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware', - 'django_prometheus.middleware.PrometheusAfterMiddleware', + 'django_prometheus.middleware.PrometheusAfterMiddleware' ] ROOT_URLCONF = 'rorapi.common.urls' @@ -105,7 +106,23 @@ # Database # https://docs.djangoproject.com/en/2.2/ref/settings/#databases -DATABASES = {} +if 'collectstatic' in sys.argv and os.environ.get('DJANGO_SKIP_DB_CHECK') == 'True': # the Dockerfile sets this flag so collectstatic can run at image build time without MySQL + DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.dummy' + } + } +else: + DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.mysql', + 'NAME': os.environ.get('DB_NAME', 'rorapi'), + 'USER': os.environ.get('DB_USER', 'root'), + 'PASSWORD': os.environ.get('DB_PASSWORD', 'password'), + 'HOST': os.environ.get('DB_HOST', 'db'), + 'PORT': os.environ.get('DB_PORT', '3306'), + } +} # Password validation # https://docs.djangoproject.com/en/2.2/ref/settings/#auth-password-validators @@ -273,3 +290,12 @@ GRID_REMOVED_IDS = [] LAUNCH_DARKLY_KEY = os.environ.get('LAUNCH_DARKLY_KEY') + +# Toggle for behavior-based rate limiting +ENABLE_BEHAVIORAL_LIMITING = os.getenv("ENABLE_BEHAVIORAL_LIMITING", "False") == "True" + +# Email settings for Django +EMAIL_BACKEND = 'django_ses.SESBackend' +AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID') +AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY') +AWS_SES_REGION_NAME = os.environ.get('AWS_REGION', 'eu-west-1') diff --git a/rorapi/tests/tests_unit/tests_client.py b/rorapi/tests/tests_unit/tests_client.py new file mode 100644 index 0000000..d70ccc9 --- /dev/null +++ b/rorapi/tests/tests_unit/tests_client.py @@ -0,0 +1,7 @@ +from django.test import TestCase +from 
rorapi.v2.models import Client + +class ClientTests(TestCase): + def test_client_registration(self): + client = Client.objects.create(email='test@example.com') + self.assertIsNotNone(client.client_id) \ No newline at end of file diff --git a/rorapi/v2/models.py b/rorapi/v2/models.py index e9c7d1a..1b5a82f 100644 --- a/rorapi/v2/models.py +++ b/rorapi/v2/models.py @@ -1,4 +1,7 @@ from geonamescache.mappers import country +import random +import string +from django.db import models from rorapi.common.models import TypeBucket, CountryBucket, StatusBucket, Entity from rorapi.v2.record_constants import continent_code_to_name @@ -130,4 +133,40 @@ class MatchingResult: def __init__(self, data): self.number_of_results = len(data) - self.items = [MatchedOrganization(x) for x in data] \ No newline at end of file + self.items = [MatchedOrganization(x) for x in data] + + +class Client(models.Model): + # Required fields + email = models.EmailField(max_length=255) + + # Optional fields + name = models.CharField(max_length=255, blank=True, null=True) + institution_name = models.CharField(max_length=255, blank=True, null=True) + institution_ror = models.URLField(max_length=255, blank=True, null=True) + country_code = models.CharField(max_length=2, blank=True, null=True) + ror_use = models.TextField(max_length=500, blank=True, null=True) + + # System fields + client_id = models.CharField( + max_length=32, + unique=True, + editable=False + ) + created_at = models.DateTimeField(auto_now_add=True) + last_request_at = models.DateTimeField(null=True, blank=True) + request_count = models.IntegerField(default=0) + + def __str__(self): + return f"{self.email} - {self.client_id}" + + @staticmethod + def generate_client_id(): + """Generate a random 32-character client ID from the OS entropy source; uniqueness is enforced only by the field's unique constraint""" + return ''.join(random.SystemRandom().choices(string.ascii_uppercase + string.digits, k=32)) + + def save(self, *args, **kwargs): + # Ensure client_id is generated before saving + if not self.client_id: # Only generate if it's empty + 
self.client_id = self.generate_client_id() + super().save(*args, **kwargs) diff --git a/rorapi/v2/serializers.py b/rorapi/v2/serializers.py index 62327a5..002fd8e 100644 --- a/rorapi/v2/serializers.py +++ b/rorapi/v2/serializers.py @@ -1,4 +1,8 @@ from rest_framework import serializers +import bleach +import pycountry +import re +from rorapi.v2.models import Client from rorapi.common.serializers import BucketSerializer, OrganizationRelationshipsSerializer class AggregationsSerializer(serializers.Serializer): @@ -87,3 +91,73 @@ class MatchedOrganizationSerializer(serializers.Serializer): class MatchingResultSerializer(serializers.Serializer): number_of_results = serializers.IntegerField() items = MatchedOrganizationSerializer(many=True) + + +class ClientSerializer(serializers.ModelSerializer): + class Meta: + model = Client + fields = ['email', 'name', 'institution_name', 'institution_ror', 'country_code', 'ror_use'] + extra_kwargs = { + 'name': {'required': False, 'allow_null': True}, + 'institution_name': {'required': False, 'allow_null': True}, + 'institution_ror': {'required': False, 'allow_null': True}, + 'country_code': {'required': False, 'allow_null': True}, + 'ror_use': {'required': False, 'allow_null': True}, + } + + def validate_email(self, value): + """Reject a null email. No uniqueness check is performed here.""" + if value is None: + raise serializers.ValidationError("Email cannot be null.") + return value + + def validate_name(self, value): + """Sanitize name and validate length. Reject empty string.""" + if value is not None: + if value == "": + raise serializers.ValidationError("Name cannot be an empty string.") + value = bleach.clean(value) # Sanitize HTML. NOTE(review): bleach.clean() escapes disallowed markup by default (strip=False) rather than removing it - confirm intent + if len(value) > 255: + raise serializers.ValidationError("Name cannot be longer than 255 characters.") + return value + + def validate_institution_name(self, value): + """Sanitize institution name and validate length. 
Reject empty string.""" + if value is not None: + if value == "": + raise serializers.ValidationError("Institution name cannot be an empty string.") + value = bleach.clean(value) # Sanitize HTML. NOTE(review): bleach.clean() escapes disallowed markup by default (strip=False) rather than removing it - confirm intent + if len(value) > 255: + raise serializers.ValidationError("Institution name cannot be longer than 255 characters.") + return value + + def validate_institution_ror(self, value): + """Validate and format institution ROR to match 'https://ror.org/XXXXX'. Reject empty string.""" + if value is not None: + if value == "": + raise serializers.ValidationError("Institution ROR cannot be an empty string.") + value = bleach.clean(value) # Sanitize HTML. NOTE(review): bleach.clean() escapes disallowed markup by default (strip=False) rather than removing it - confirm intent + ror_regex = r'https://ror\.org/[A-Za-z0-9]+' + if not re.fullmatch(ror_regex, value): # fullmatch: the whole value must match, so trailing junk after the ID is rejected + raise serializers.ValidationError("Institution ROR must be in the format 'https://ror.org/XXXXX'.") + return value + + def validate_country_code(self, value): + """Validate that the country code is a valid ISO 3166-1 alpha-2 country code. Reject empty string.""" + if value is not None: + if value == "": + raise serializers.ValidationError("Country code cannot be an empty string.") + value = value.strip().upper() # Normalize to uppercase + if len(value) != 2 or not pycountry.countries.get(alpha_2=value): + raise serializers.ValidationError(f"{value} is not a valid ISO 3166-1 alpha-2 country code.") + return value + + def validate_ror_use(self, value): + """Sanitize ror_use and validate length. 
Reject empty string.""" + if value is not None: + if value == "": + raise serializers.ValidationError("ROR use cannot be an empty string.") + value = bleach.clean(value) # Sanitize HTML. NOTE(review): bleach.clean() escapes disallowed markup by default (strip=False) rather than removing it - confirm intent + if len(value) > 500: + raise serializers.ValidationError("ROR use cannot be longer than 500 characters.") + return value diff --git a/rorapi/v2/tests.py b/rorapi/v2/tests.py new file mode 100644 index 0000000..5e2c9ef --- /dev/null +++ b/rorapi/v2/tests.py @@ -0,0 +1,11 @@ +from django.test import TestCase +from .models import Client + +class ClientTests(TestCase): + def test_client_registration(self): + client = Client.objects.create(email='test@example.com') + self.assertIsNotNone(client.client_id) + + def test_rate_limiting(self): + response = self.client.get('/client-id/', HTTP_CLIENT_ID="INVALID_ID") # NOTE(review): no '/client-id/' route is visible in rorapi/common/urls.py - confirm this path and the expected 429 + self.assertEqual(response.status_code, 429)