From f5210d920b33b8c4e70002a9bf1b8520d2224685 Mon Sep 17 00:00:00 2001
From: Giorgos Avramidis
Date: Mon, 2 Jun 2025 15:58:36 +0300
Subject: [PATCH] script to download feks

---
 .formatter.exs       |   4 +
 .gitignore           |  26 +++++++
 lib/fek_fetcher.ex   | 175 +++++++++++++++++++++++++++++++++++++++++++
 lib/fekex.ex         |  18 +++++
 mix.exs              |  28 +++++++
 mix.lock             |  23 ++++++
 scripts/fetch.exs    |  13 ++++
 test/fekex_test.exs  |   8 ++
 test/test_helper.exs |   1 +
 9 files changed, 296 insertions(+)
 create mode 100644 .formatter.exs
 create mode 100644 .gitignore
 create mode 100644 lib/fek_fetcher.ex
 create mode 100644 lib/fekex.ex
 create mode 100644 mix.exs
 create mode 100644 mix.lock
 create mode 100644 scripts/fetch.exs
 create mode 100644 test/fekex_test.exs
 create mode 100644 test/test_helper.exs

diff --git a/.formatter.exs b/.formatter.exs
new file mode 100644
index 0000000..d2cda26
--- /dev/null
+++ b/.formatter.exs
@@ -0,0 +1,4 @@
+# Used by "mix format"
+[
+  inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
+]
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f08116a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,26 @@
+# The directory Mix will write compiled artifacts to.
+/_build/
+
+# If you run "mix test --cover", coverage assets end up here.
+/cover/
+
+# The directory Mix downloads your dependencies sources to.
+/deps/
+
+# Where third-party dependencies like ExDoc output generated docs.
+/doc/
+
+# If the VM crashes, it generates a dump, let's ignore it too.
+erl_crash.dump
+
+# Also ignore archive artifacts (built via "mix archive.build").
+*.ez
+
+# Ignore package tarball (built via "mix hex.build").
+fekex-*.tar
+
+# Temporary files, for example, from tests.
+/tmp/
+
+.tool-versions
+data/
diff --git a/lib/fek_fetcher.ex b/lib/fek_fetcher.ex
new file mode 100644
index 0000000..3c94c74
--- /dev/null
+++ b/lib/fek_fetcher.ex
@@ -0,0 +1,220 @@
+defmodule FekFetcher do
+  @moduledoc """
+  Searches the FEK search API and downloads each result's PDF and JSON metadata.
+  """
+
+  require Logger
+
+  defmodule FekSearchResult do
+    @moduledoc """
+    One entry of a `simplesearch` response, reduced to the fields this module reads.
+    """
+
+    @enforce_keys [
+      :document_number,
+      :id,
+      :issue_date,
+      :issue_group_id,
+      :primary_label,
+      :publication_date
+    ]
+    defstruct [
+      :document_number,
+      :id,
+      :issue_date,
+      :issue_group_id,
+      :primary_label,
+      :publication_date
+    ]
+
+    @type t :: %__MODULE__{
+            document_number: String.t(),
+            id: String.t(),
+            issue_date: String.t(),
+            issue_group_id: String.t(),
+            primary_label: String.t(),
+            publication_date: String.t()
+          }
+
+    @doc """
+    Builds a struct from one decoded search-result map.
+
+    Returns `{:error, message}` when any of the `search_*` keys read here is
+    missing from `data`.
+    """
+    @spec from_search_result(map()) :: {:ok, t()} | {:error, String.t()}
+    def from_search_result(data) do
+      with {:ok, document_number} <- Map.fetch(data, "search_DocumentNumber"),
+           {:ok, id} <- Map.fetch(data, "search_ID"),
+           {:ok, issue_date} <- Map.fetch(data, "search_IssueDate"),
+           {:ok, issue_group_id} <- Map.fetch(data, "search_IssueGroupID"),
+           {:ok, primary_label} <- Map.fetch(data, "search_PrimaryLabel"),
+           {:ok, publication_date} <- Map.fetch(data, "search_PublicationDate") do
+        {:ok,
+         %__MODULE__{
+           document_number: document_number,
+           id: id,
+           issue_date: issue_date,
+           issue_group_id: issue_group_id,
+           primary_label: primary_label,
+           publication_date: publication_date
+         }}
+      else
+        :error -> {:error, "Missing required fields in search result"}
+      end
+    end
+
+    @doc """
+    Returns the year component of `issue_date` as a string.
+
+    The date part must consist of three `/`-separated fields with the year last.
+    Anything after the first space (the time of day) is ignored.
+    """
+    @spec issue_year(t()) :: String.t()
+    def issue_year(%__MODULE__{issue_date: issue_date}) do
+      # `parts: 2` tolerates a time made of several tokens, or no time at all.
+      [date | _time] = String.split(issue_date, " ", parts: 2)
+      [_month, _day, issue_year] = String.split(date, "/")
+
+      issue_year
+    end
+  end
+
+  @base_url "https://searchetv99.azurewebsites.net/api/"
+  @pdf_base_url "https://ia37rg02wpsa01.blob.core.windows.net/fek/"
+
+  # Performs a GET and returns `{:ok, body}` only for a 200 response.
+  defp get(url), do: url |> Req.get() |> handle_response()
+
+  # Performs a JSON POST and returns `{:ok, body}` only for a 200 response.
+  defp post(url, data), do: url |> Req.post(json: data) |> handle_response()
+
+  # Normalises a Req result into `{:ok, body} | {:error, message}`.
+  defp handle_response({:ok, %Req.Response{status: 200, body: body}}), do: {:ok, body}
+
+  defp handle_response({:ok, %Req.Response{status: status}}),
+    do: {:error, "Request failed with status #{status}"}
+
+  defp handle_response({:error, reason}),
+    do: {:error, "Request failed: #{format_reason(reason)}"}
+
+  # Failure reasons can be exception structs or arbitrary terms, which cannot be
+  # interpolated into a string directly, so render them explicitly.
+  defp format_reason(reason) when is_binary(reason), do: reason
+  defp format_reason(reason) when is_exception(reason), do: Exception.message(reason)
+  defp format_reason(reason), do: inspect(reason)
+
+  # Builds the URL of the PDF for `fek` under @pdf_base_url.
+  defp pdf_url(%FekSearchResult{} = fek) do
+    document_number = String.pad_leading(fek.document_number, 5, "0")
+    issue_group_id = String.pad_leading(fek.issue_group_id, 2, "0")
+    issue_year = FekSearchResult.issue_year(fek)
+
+    filename = "#{issue_year}#{issue_group_id}#{document_number}"
+
+    "#{@pdf_base_url}#{issue_group_id}/#{issue_year}/#{filename}.pdf"
+  end
+
+  # Directory for one fek: `<base_path>/<year>/<primary label with "/" as "_">`.
+  defp base_fek_path(%FekSearchResult{} = fek, base_path) do
+    year = FekSearchResult.issue_year(fek)
+    primary_label = String.replace(fek.primary_label, "/", "_")
+
+    Path.join([base_path, year, primary_label])
+  end
+
+  # Starts one task per metadata endpoint. Each task writes the response's "data"
+  # field to `dst/<filename>` and returns `:ok`, or whatever `get/1` returned when
+  # the response did not match.
+  defp spawn_tasks_to_get_json_metadata(%FekSearchResult{} = fek, dst) do
+    # Paths are relative: @base_url already ends in "/", so a leading "/" here
+    # would produce "api//..." URLs.
+    [
+      {"metadata.json", "documententitybyid/#{fek.id}"},
+      {"timeline.json", "timeline/#{fek.id}/0"},
+      {"named_entity.json", "namedentity/#{fek.id}"},
+      {"tags.json", "tagsbydocumententity/#{fek.id}"}
+    ]
+    |> Enum.map(fn {filename, url_path} ->
+      Task.async(fn ->
+        with {:ok, %{"data" => data}} <- get(@base_url <> url_path) do
+          File.write!(Path.join(dst, filename), data)
+        end
+      end)
+    end)
+  end
+
+  # Downloads the PDF for `fek` into `dst`, named after the document number.
+  defp get_pdf(%FekSearchResult{} = fek, dst) do
+    case get(pdf_url(fek)) do
+      {:ok, pdf_data} ->
+        file_path = Path.join(dst, "#{fek.document_number}.pdf")
+        File.write!(file_path, pdf_data)
+        {:ok, file_path}
+
+      {:error, reason} ->
+        {:error, "Failed to download PDF: #{reason}"}
+    end
+  end
+
+  @doc """
+  Downloads the PDF and the JSON metadata files for `fek` concurrently.
+
+  Files are written under `dst/<year>/<primary_label>`, which is created if
+  needed. Failed downloads are logged as warnings and do not change the return
+  value, which is always `{:ok, directory}`.
+  """
+  @spec process_fek_result(FekSearchResult.t(), Path.t()) :: {:ok, Path.t()}
+  def process_fek_result(%FekSearchResult{} = fek, dst) do
+    fek_path = base_fek_path(fek, dst)
+    File.mkdir_p!(fek_path)
+
+    tasks = [
+      Task.async(fn -> get_pdf(fek, fek_path) end)
+      | spawn_tasks_to_get_json_metadata(fek, fek_path)
+    ]
+
+    for {:error, reason} <- Task.await_many(tasks, :infinity) do
+      Logger.warning("#{fek.primary_label}: #{reason}")
+    end
+
+    {:ok, fek_path}
+  end
+
+  @doc """
+  Searches for feks matching `years` and `issues`.
+
+  Both arguments are sent unchanged as the `selectYear` and `selectIssue` fields
+  of the `simplesearch` request. Results lacking a required field are dropped.
+
+  Returns `{:ok, [FekSearchResult.t()]}` or `{:error, message}`.
+  """
+  @spec search_feks(term(), term()) :: {:ok, [FekSearchResult.t()]} | {:error, String.t()}
+  def search_feks(years, issues) do
+    query = %{
+      "selectYear" => years,
+      "selectIssue" => issues
+    }
+
+    with {:ok, %{"data" => data}} <- post(@base_url <> "simplesearch", query),
+         {:ok, parsed} <- JSON.decode(data) do
+      # flat_map converts and drops invalid entries in one pass, so no error
+      # tuples end up in the returned list.
+      feks =
+        Enum.flat_map(parsed, fn item ->
+          case FekSearchResult.from_search_result(item) do
+            {:ok, fek} -> [fek]
+            {:error, _reason} -> []
+          end
+        end)
+
+      {:ok, feks}
+    else
+      {:error, reason} ->
+        {:error, "Failed to fetch fek IDs: #{format_reason(reason)}"}
+
+      _unexpected ->
+        {:error, "Failed to fetch fek IDs: unexpected response format"}
+    end
+  end
+end
diff --git a/lib/fekex.ex b/lib/fekex.ex
new file mode 100644
index 0000000..00362be
--- /dev/null
+++ b/lib/fekex.ex
@@ -0,0 +1,18 @@
+defmodule Fekex do
+  @moduledoc """
+  Documentation for `Fekex`.
+  """
+
+  @doc """
+  Hello world.
+
+  ## Examples
+
+      iex> Fekex.hello()
+      :world
+
+  """
+  def hello do
+    :world
+  end
+end
diff --git a/mix.exs b/mix.exs
new file mode 100644
index 0000000..ed91ee9
--- /dev/null
+++ b/mix.exs
@@ -0,0 +1,28 @@
+defmodule Fekex.MixProject do
+  use Mix.Project
+
+  def project do
+    [
+      app: :fekex,
+      version: "0.1.0",
+      elixir: "~> 1.18",
+      start_permanent: Mix.env() == :prod,
+      deps: deps(),
+    ]
+  end
+
+  # Run "mix help compile.app" to learn about applications.
+  def application do
+    [
+      extra_applications: [:logger]
+    ]
+  end
+
+  # Run "mix help deps" to learn about dependencies.
+  defp deps do
+    [
+      {:req, "~> 0.5.0"},
+      {:progress_bar, "~> 3.0"}
+    ]
+  end
+end
diff --git a/mix.lock b/mix.lock
new file mode 100644
index 0000000..e6bdff7
--- /dev/null
+++ b/mix.lock
@@ -0,0 +1,23 @@
+%{
+  "certifi": {:hex, :certifi, "2.15.0", "0e6e882fcdaaa0a5a9f2b3db55b1394dba07e8d6d9bcad08318fb604c6839712", [:rebar3], [], "hexpm", "b147ed22ce71d72eafdad94f055165c1c182f61a2ff49df28bcc71d1d5b94a60"},
+  "decimal": {:hex, :decimal, "2.3.0", "3ad6255aa77b4a3c4f818171b12d237500e63525c2fd056699967a3e7ea20f62", [:mix], [], "hexpm", "a4d66355cb29cb47c3cf30e71329e58361cfcb37c34235ef3bf1d7bf3773aeac"},
+  "file_system": {:hex, :file_system, "1.1.0", "08d232062284546c6c34426997dd7ef6ec9f8bbd090eb91780283c9016840e8f", [:mix], [], "hexpm", "bfcf81244f416871f2a2e15c1b515287faa5db9c6bcf290222206d120b3d43f6"},
+  "finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"},
+  "hackney": {:hex, :hackney, "1.24.1", "f5205a125bba6ed4587f9db3cc7c729d11316fa8f215d3e57ed1c067a9703fa9", [:rebar3], [{:certifi, "~> 2.15.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.4", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "f4a7392a0b53d8bbc3eb855bdcc919cd677358e65b2afd3840b5b3690c4c8a39"},
+  "hpax": {:hex, :hpax, "1.0.3", "ed67ef51ad4df91e75cc6a1494f851850c0bd98ebc0be6e81b026e765ee535aa", [:mix], [], "hexpm", "8eab6e1cfa8d5918c2ce4ba43588e894af35dbd8e91e6e55c817bca5847df34a"},
+  "httpoison": {:hex, :httpoison, "2.2.3", "a599d4b34004cc60678999445da53b5e653630651d4da3d14675fedc9dd34bd6", [:mix], [{:hackney, "~> 1.21", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "fa0f2e3646d3762fdc73edb532104c8619c7636a6997d20af4003da6cfc53e53"},
+  "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
+  "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
+  "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
+  "mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"},
+  "mimerl": {:hex, :mimerl, "1.4.0", "3882a5ca67fbbe7117ba8947f27643557adec38fa2307490c4c4207624cb213b", [:rebar3], [], "hexpm", "13af15f9f68c65884ecca3a3891d50a7b57d82152792f3e19d88650aa126b144"},
+  "mint": {:hex, :mint, "1.7.1", "113fdb2b2f3b59e47c7955971854641c61f378549d73e829e1768de90fc1abf1", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "fceba0a4d0f24301ddee3024ae116df1c3f4bb7a563a731f45fdfeb9d39a231b"},
+  "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"},
+  "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
+  "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
+  "progress_bar": {:hex, :progress_bar, "3.0.0", "f54ff038c2ac540cfbb4c2bfe97c75e7116ead044f3c2b10c9f212452194b5cd", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "6981c2b25ab24aecc91a2dc46623658e1399c21a2ae24db986b90d678530f2b7"},
+  "req": {:hex, :req, "0.5.10", "a3a063eab8b7510785a467f03d30a8d95f66f5c3d9495be3474b61459c54376c", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "8a604815743f8a2d3b5de0659fa3137fa4b1cffd636ecb69b30b2b9b2c2559be"},
+  "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
+  "telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"},
+  "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.1", "a48703a25c170eedadca83b11e88985af08d35f37c6f664d6dcfb106a97782fc", [:rebar3], [], "hexpm", "b3a917854ce3ae233619744ad1e0102e05673136776fb2fa76234f3e03b23642"},
+}
diff --git a/scripts/fetch.exs b/scripts/fetch.exs
new file mode 100644
index 0000000..be055a1
--- /dev/null
+++ b/scripts/fetch.exs
@@ -0,0 +1,20 @@
+# Run inside the project, e.g. `mix run scripts/fetch.exs 2024`.
+# The command-line arguments are passed to the search as the years; results are
+# downloaded into ./data.
+dst = "data"
+years = System.argv()
+{:ok, results} = FekFetcher.search_feks(years, ["2"])
+
+total = Enum.count(results)
+File.mkdir_p!(dst)
+
+results
+|> Task.async_stream(&FekFetcher.process_fek_result(&1, dst),
+  timeout: :infinity,
+  max_concurrency: 5
+)
+# Number completed items from 1 so the final render is total/total.
+|> Stream.with_index(1)
+|> Enum.each(fn {_result, completed} ->
+  ProgressBar.render(completed, total)
+end)
diff --git a/test/fekex_test.exs b/test/fekex_test.exs
new file mode 100644
index 0000000..c946eed
--- /dev/null
+++ b/test/fekex_test.exs
@@ -0,0 +1,8 @@
+defmodule FekexTest do
+  use ExUnit.Case
+  doctest Fekex
+
+  test "greets the world" do
+    assert Fekex.hello() == :world
+  end
+end
diff --git a/test/test_helper.exs b/test/test_helper.exs
new file mode 100644
index 0000000..869559e
--- /dev/null
+++ b/test/test_helper.exs
@@ -0,0 +1 @@
+ExUnit.start()