diff --git a/infra/reverse_proxy/docker-compose.yml b/infra/reverse_proxy/docker-compose.yml new file mode 100644 index 0000000..d2ebb51 --- /dev/null +++ b/infra/reverse_proxy/docker-compose.yml @@ -0,0 +1,8 @@ +version: '2' +services: + app: + image: nginx + volumes: + - ./reverse.conf:/etc/nginx/conf.d/default.conf + ports: + - "8080:80" diff --git a/infra/reverse_proxy/reverse.conf b/infra/reverse_proxy/reverse.conf new file mode 100644 index 0000000..750086a --- /dev/null +++ b/infra/reverse_proxy/reverse.conf @@ -0,0 +1,50 @@ +server { + listen 80; + listen [::]:80; + server_name localhost; + + #charset koi8-r; + #access_log /var/log/nginx/host.access.log main; + +# location / { +# root /usr/share/nginx/html; +# index index.html index.htm; +# } + + #error_page 404 /404.html; + + # redirect server error pages to the static page /50x.html + # + error_page 500 502 503 504 /50x.html; + location = /50x.html { + root /usr/share/nginx/html; + } + + location / { + proxy_pass https://portaldjbr.contraloria.gov.py/; + } + + # proxy the PHP scripts to Apache listening on 127.0.0.1:80 + # + #location ~ \.php$ { + # proxy_pass http://127.0.0.1; + #} + + # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000 + # + #location ~ \.php$ { + # root html; + # fastcgi_pass 127.0.0.1:9000; + # fastcgi_index index.php; + # fastcgi_param SCRIPT_FILENAME /scripts$fastcgi_script_name; + # include fastcgi_params; + #} + + # deny access to .htaccess files, if Apache's document root + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} +} + diff --git a/scripts/python/airflow/dags/contralory_declaration_download_pdfs.py b/scripts/python/airflow/dags/contralory_declaration_download_pdfs.py index 08ab9d4..905edd3 100644 --- a/scripts/python/airflow/dags/contralory_declaration_download_pdfs.py +++ b/scripts/python/airflow/dags/contralory_declaration_download_pdfs.py @@ -38,6 +38,7 @@ dag_job_target_dir = os.path.join(Variable.get("CGR_PDF_FOLDER", os.path.join(os.sep, "tmp", "contralory", "raw"))) dag_sub_jobs_count = int(Variable.get("CGR_DOWNLOAD_PDF_SUB_JOBS_COUNT", 10)) +dag_base_url = Variable.get("CGR_DOWNLOAD_PDF_BASE_URL", "https://portaldjbr.contraloria.gov.py/portal-djbr/api/consulta/descargarpdf/") default_args = { "owner": "airflow", @@ -47,7 +48,6 @@ "email_on_retry": False, "retry_delay": timedelta(hours=1), "params": { - "url": "https://portaldjbr.contraloria.gov.py/portal-djbr/api/consulta/descargarpdf/", "target_dir": dag_job_target_dir }, } @@ -237,7 +237,7 @@ def do_work(number: int, url: str, target_dir: str, mod_of: int): python_callable=do_work, op_kwargs={ "number": digit, - "url": "{{ params.url }}", + "url": dag_base_url, "target_dir": "{{ params.target_dir }}", "mod_of": dag_sub_jobs_count }, diff --git a/scripts/python/airflow/dags/contralory_declaration_link_fetcher.py b/scripts/python/airflow/dags/contralory_declaration_link_fetcher.py index 12b3f8d..35f9de9 100644 --- a/scripts/python/airflow/dags/contralory_declaration_link_fetcher.py +++ b/scripts/python/airflow/dags/contralory_declaration_link_fetcher.py @@ -4,11 +4,13 @@ import requests from re import findall as re_findall from airflow.hooks.postgres_hook import PostgresHook -from airflow.models import DAG from airflow.operators.dummy_operator import DummyOperator from airflow.operators.postgres_operator import PostgresOperator from airflow.operators.python_operator import PythonOperator from airflow.utils.dates import days_ago +from airflow.models import DAG, Variable + +dag_base_url = Variable.get("CGR_FETCHER_BASE_URL", "https://portaldjbr.contraloria.gov.py/portal-djbr/api/consulta/declaraciones/paginadas") default_args = { "owner": "airflow", @@ -18,7 +20,6 @@ "email_on_retry": False, "retry_delay": timedelta(hours=1), "params": { - "url": "https://portaldjbr.contraloria.gov.py/portal-djbr/api/consulta/declaraciones/paginadas", }, } @@ -82,7 +83,6 @@ def list_navigator(base_query: str, url: str): yield records should_continue = len(records) != 0 page += 1 - # should_continue = False def get_upsert_query() -> str: @@ -145,7 +145,6 @@ def fetch_list(letter: str, url: str, **kwargs): ''') for letter in ['a', 'e', 'i', 'o', 'u']: - # for letter in ['a']: # Get list from webpage get_pdf_list = PythonOperator( task_id=f"""get_pdf_list_{letter}""", @@ -153,7 +152,7 @@ def fetch_list(letter: str, url: str, **kwargs): python_callable=fetch_list, op_kwargs={ "letter": letter, - "url": "{{ params.url }}", + "url": dag_base_url, }, )