From 9fe9a002e52b07e9f3c97218ac3c8efe1f943a42 Mon Sep 17 00:00:00 2001
From: Aaron Elkiss
Date: Tue, 23 Sep 2025 09:21:57 -0400
Subject: [PATCH 1/9] ETT-574: Test deposit to multiple linked pairtrees

This simulates depositing to both the Isilon and TrueNAS at one site; it
doesn't model deposit to the remote site.
---
 t/collate.t | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/t/collate.t b/t/collate.t
index 3b36ed09..e5e94fc6 100644
--- a/t/collate.t
+++ b/t/collate.t
@@ -172,6 +172,59 @@ describe "HTFeed::Collate" => sub {
       ok($testlog->matches(qw(INFO.*already in repo)));
     };
 
+    context "with multiple linked pairtrees" => sub {
+      my $old_storage_classes;
+
+      before each => sub {
+        $old_storage_classes = get_config('storage_classes');
+        my $new_storage_classes = {
+          'linkedpairtree-test1' =>
+          {
+            class => 'HTFeed::Storage::LinkedPairtree',
+            obj_dir => $tmpdirs->{obj_dir} . "/tree1",
+            link_dir => $tmpdirs->{link_dir} . "/tree1"
+          },
+          'linkedpairtree-test2' =>
+          {
+            class => 'HTFeed::Storage::LinkedPairtree',
+            obj_dir => $tmpdirs->{obj_dir} . "/tree2",
+            link_dir => $tmpdirs->{link_dir} . "/tree2"
+          },
+        };
+        set_config($new_storage_classes,'storage_classes');
+      };
+
+      after each => sub {
+        set_config($old_storage_classes,'storage_classes');
+      };
+
+      it "copies and records to all configured storages" => sub {
+        my $volume = stage_volume($tmpdirs,'test','test');
+        my $stage = HTFeed::Stage::Collate->new(volume => $volume);
+        $stage->run;
+
+        my $dbh = get_dbh();
+        my $audits = $dbh->selectall_arrayref("SELECT * from feed_audit WHERE namespace = 'test' and id = 'test'");
+
+        is(scalar(@{$audits}),1,'records an audit');
+
+        ok(-e "$tmpdirs->{obj_dir}/tree1/test/pairtree_root/te/st/test/test.mets.xml",'copies mets to local storage');
+        ok(-e "$tmpdirs->{obj_dir}/tree1/test/pairtree_root/te/st/test/test.zip",'copies zip to local storage');
+
+        ok(-e "$tmpdirs->{link_dir}/tree1/test/pairtree_root/te/st/test/test.mets.xml",'links mets to local storage');
+        ok(-e "$tmpdirs->{link_dir}/tree1/test/pairtree_root/te/st/test/test.zip",'links zip to local storage');
+
+
+        ok(-e "$tmpdirs->{obj_dir}/tree2/test/pairtree_root/te/st/test/test.mets.xml",'copies mets to local storage 2');
+        ok(-e "$tmpdirs->{obj_dir}/tree2/test/pairtree_root/te/st/test/test.zip",'copies zip to local storage 2');
+
+        ok(-e "$tmpdirs->{link_dir}/tree2/test/pairtree_root/te/st/test/test.mets.xml",'links mets to local storage 2');
+        ok(-e "$tmpdirs->{link_dir}/tree2/test/pairtree_root/te/st/test/test.zip",'links zip to local storage 2');
+
+        ok($stage->succeeded);
+      };
+    };
+
     context "with multiple real storage classes" => sub {
       spec_helper 's3_helper.pl';
 

From d7cbde2804dbab0eca4f365faaaaf3013630414d Mon Sep 17 00:00:00 2001
From: Aaron Elkiss
Date: Tue, 23 Sep 2025 16:38:51 -0400
Subject: [PATCH 2/9] ETT-574: Tests for pairtree object store

* Test depositing to multiple linked pairtrees (local)
* Add versitygw for filesystem -> s3 gateway test
* PairtreeObjectStore for deposit to filesystem via s3 gateway
* collate test with this

Still TODO:

* Do we need to handle symlinking with the PairtreeObjectStore? (Would rather not..)
* Unit tests for other aspects of PairtreeObjectStore
* Consider how/where to store audit info for PairtreeObjectStore
---
 docker-compose.yml                        | 11 ++++++
 lib/HTFeed/Stage/Collate.pm               |  1 +
 lib/HTFeed/Storage/PairtreeObjectStore.pm | 39 +++++++++++++++++++
 t/collate.t                               | 46 +++++++++++++++++++++++
 t/s3_helper.pl                            |  2 +-
 5 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 lib/HTFeed/Storage/PairtreeObjectStore.pm

diff --git a/docker-compose.yml b/docker-compose.yml
index ace5bb00..a33844de 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -131,6 +131,17 @@ services:
       <<: *healthcheck-defaults
       test: timeout 5s mc ready local
 
+  # S3 -> filesystem gateway
+  versitygw:
+    image: versity/versitygw
+    restart: always
+    environment:
+      ROOT_ACCESS_KEY: TESTINGACCESSKEY
+      ROOT_SECRET_KEY: testingsecretkey
+    volumes:
+      - ./var/vgw:/vgw
+    command: posix /vgw
+
   pushgateway:
     image: prom/pushgateway
     command:
diff --git a/lib/HTFeed/Stage/Collate.pm b/lib/HTFeed/Stage/Collate.pm
index 81b5a587..0e5a5930 100644
--- a/lib/HTFeed/Stage/Collate.pm
+++ b/lib/HTFeed/Stage/Collate.pm
@@ -9,6 +9,7 @@ use Carp qw(croak);
 use HTFeed::Config qw(get_config);
 use HTFeed::Storage::LinkedPairtree;
 use HTFeed::Storage::LocalPairtree;
+use HTFeed::Storage::PairtreeObjectStore;
 use HTFeed::Storage::ObjectStore;
 use HTFeed::Storage::PrefixedVersions;
 use Log::Log4perl qw(get_logger);
diff --git a/lib/HTFeed/Storage/PairtreeObjectStore.pm b/lib/HTFeed/Storage/PairtreeObjectStore.pm
new file mode 100644
index 00000000..89929093
--- /dev/null
+++ b/lib/HTFeed/Storage/PairtreeObjectStore.pm
@@ -0,0 +1,39 @@
+package HTFeed::Storage::PairtreeObjectStore;
+
+# Stores using the S3 protocol but with pairtree paths
+
+use HTFeed::Storage::ObjectStore;
+use base qw(HTFeed::Storage::ObjectStore);
+
+use File::Pairtree qw(id2ppath s2ppchars);
+
+sub object_path {
+  my $self = shift;
+
+  return sprintf(
+    '%s/%s%s/%s',
+    $self->{namespace},
+    id2ppath($self->{objid}),
+    s2ppchars($self->{objid})
+  );
+}
+
+sub zip_key {
+  my $self = shift;
+
+  return $self->object_path . $self->{volume}->get_pt_objid() . $self->zip_suffix;
+
+}
+
+sub mets_key {
+  my $self = shift;
+
+  return $self->object_path . $self->{volume}->get_mets_filename;
+}
+
+sub record_audit {
+  # noop for now - maybe want to record info on the second site?
+  return 1;
+}
+
+1;
diff --git a/t/collate.t b/t/collate.t
index e5e94fc6..606060d7 100644
--- a/t/collate.t
+++ b/t/collate.t
@@ -172,6 +172,52 @@ describe "HTFeed::Collate" => sub {
       ok($testlog->matches(qw(INFO.*already in repo)));
     };
 
+    context "with PairtreeObjectStore" => sub {
+      my $s3;
+      my $bucket;
+
+      before all => sub {
sprintf("%08d",rand(1000000)); + $s3 = HTFeed::Storage::S3->new( + bucket => $bucket, + awscli => get_config('versitygw_awscli') + ); + $ENV{AWS_MAX_ATTEMPTS} = 1; + + $s3->mb; + }; + + my $old_storage_classes; + + before each => sub { + $old_storage_classes = get_config('storage_classes'); + my $new_storage_classes = { + 'pairtree_object_Store' => + { + class => 'HTFeed::Storage::PairtreeObjectStore', + bucket => $s3->{bucket}, + awscli => $s3->{awscli}, + } + }; + set_config($new_storage_classes,'storage_classes'); + }; + + after each => sub { + set_config($old_storage_classes,'storage_classes'); + }; + + it "copies and records to pairtree path" => sub { + my $volume = stage_volume($tmpdirs,'test','test'); + my $stage = HTFeed::Stage::Collate->new(volume => $volume); + $stage->run; + + ok(-e "$ENV{FEED_HOME}/var/vgw/$bucket/test/pairtree_root/te/st/test/test.mets.xml",'copies mets to pairtree in s3'); + ok(-e "$ENV{FEED_HOME}/var/vgw/$bucket/test/pairtree_root/te/st/test/test.zip",'copies zip to pairtree in s3'); + + ok($stage->succeeded); + }; + }; + context "with multiple linked pairtrees" => sub { my $old_storage_classes; diff --git a/t/s3_helper.pl b/t/s3_helper.pl index b6420358..490cf4b0 100644 --- a/t/s3_helper.pl +++ b/t/s3_helper.pl @@ -6,7 +6,7 @@ $bucket = "bucket" . sprintf("%08d",rand(1000000)); $s3 = HTFeed::Storage::S3->new( bucket => $bucket, - awscli => get_config('test_awscli') + awscli => get_config('minio_awscli') ); $ENV{AWS_MAX_ATTEMPTS} = 1; From c3668689d3115f9050aa7a288dfaaf2440c33ee1 Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Wed, 24 Sep 2025 14:59:15 -0400 Subject: [PATCH 3/9] ETT-574: Unit tests for PairtreeObjectStore Primarily testing the different paths vs. ObjectStore; stubs for testing different recording expectations and for behavior with symlinks --- lib/HTFeed/Storage/PairtreeObjectStore.pm | 2 +- t/storage_pairtree_object_store.t | 75 +++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 t/storage_pairtree_object_store.t diff --git a/lib/HTFeed/Storage/PairtreeObjectStore.pm b/lib/HTFeed/Storage/PairtreeObjectStore.pm index 89929093..453c6c7e 100644 --- a/lib/HTFeed/Storage/PairtreeObjectStore.pm +++ b/lib/HTFeed/Storage/PairtreeObjectStore.pm @@ -11,7 +11,7 @@ sub object_path { my $self = shift; return sprintf( - '%s/%s%s/%s', + '%s/%s%s/', $self->{namespace}, id2ppath($self->{objid}), s2ppchars($self->{objid}) diff --git a/t/storage_pairtree_object_store.t b/t/storage_pairtree_object_store.t new file mode 100644 index 00000000..2394967c --- /dev/null +++ b/t/storage_pairtree_object_store.t @@ -0,0 +1,75 @@ +use HTFeed::Config qw(get_config); +use Test::Spec; +use Test::Exception; +use HTFeed::Storage::PairtreeObjectStore; + +use strict; + +describe "HTFeed::Storage::PairtreeObjectStore" => sub { + spec_helper 'storage_helper.pl'; + local our ($tmpdirs, $testlog, $bucket, $s3); + + before all => sub { + $bucket = "bucket" . 
sprintf("%08d",rand(1000000)); + $s3 = HTFeed::Storage::S3->new( + bucket => $bucket, + awscli => get_config('versitygw_awscli') + ); + $ENV{AWS_MAX_ATTEMPTS} = 1; + + $s3->mb; + }; + + after all => sub { + $s3->rm('/',"--recursive"); + $s3->rb; + }; + + sub object_storage { + my $volume = stage_volume($tmpdirs,@_); + + my $storage = HTFeed::Storage::PairtreeObjectStore->new( + name => 'pairtreeobjectstore-test', + volume => $volume, + config => { + bucket => $s3->{bucket}, + awscli => $s3->{awscli} + }, + ); + + return $storage; + } + + describe "#object_path" => sub { + it "includes the namespace, pairtree path, and pairtreeized object id" => sub { + my $storage = object_storage('test','ark:/123456/abcde'); + + is($storage->object_path, "test/pairtree_root/ar/k+/=1/23/45/6=/ab/cd/e/ark+=123456=abcde/"); + }; + }; + + describe "#move" => sub { + before each => sub { + $s3->rm("/","--recursive"); + }; + + it "uploads zip and mets" => sub { + my $storage = object_storage('test','test'); + $storage->move; + + ok($s3->s3_has("test/pairtree_root/te/st/test/test.zip")); + ok($s3->s3_has("test/pairtree_root/te/st/test/test.mets.xml")); + }; + + }; + + describe "#record_audit" => sub { + it "records the item info in the feed_audit table"; + it "does something with the sdr bucket"; + }; + + it "deals with old symlinks"; + +}; + +runtests unless caller; From 0c456ee4d0c347d8f468180b4f24d171f0c5cd7a Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Fri, 26 Sep 2025 16:33:51 -0400 Subject: [PATCH 4/9] ETT-574: Test writing through symlinks with versitygw; remove minio * no need for two different s3 gateways in docker compose setup * remove duplicative setup for s3 in tests * test that we can transparently write through an existing symlink to a directory with minio * add dependencies & health checks in docker compose for versitygw * set permissions for versitygw directory in github actions --- .github/workflows/tests.yml | 7 +++- docker-compose.yml | 25 +++++-------- etc/config_test.yml | 5 +-- t/collate.t | 15 +------- t/s3_helper.pl | 2 +- t/storage_pairtree_object_store.t | 61 +++++++++++++++++++++---------- var/vgw/.keep | 0 7 files changed, 62 insertions(+), 53 deletions(-) create mode 100644 var/vgw/.keep diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b9ce44e9..5512952e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,9 +16,14 @@ jobs: steps: - uses: actions/checkout@v4 + # runner runs as userid 1001 but userid 1000 is baked into docker image. + # we could adjust this if needed via env var but this should work + - name: Adjust permissions for versitygw directory + run: chmod 777 var/vgw + - name: Build docker image run: docker compose build - + - name: Run tests run: docker compose run test-and-cover env: diff --git a/docker-compose.yml b/docker-compose.yml index a33844de..f147eee9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -25,9 +25,9 @@ services: command: prove depends_on: mariadb: *healthy - minio: *healthy pushgateway: *healthy rabbitmq: *healthy + versitygw: *healthy # Note: for permissions purposes this does NOT bind in the local development # environment, so local changes after running docker compose build will NOT @@ -36,6 +36,7 @@ services: build: . volumes: - ./clamav:/var/lib/clamav + - ./var/vgw:/usr/local/feed/var/vgw environment: - HTFEED_CONFIG=/usr/local/feed/etc/config_test.yml - FEED_HOME=/usr/local/feed @@ -53,9 +54,9 @@ services: command: cover -test -report Coveralls -make 'prove; exit $?' 
     depends_on:
       mariadb: *healthy
-      minio: *healthy
       pushgateway: *healthy
       rabbitmq: *healthy
+      versitygw: *healthy
 
   ingest:
     build: .
@@ -119,28 +120,20 @@
       <<: *healthcheck-defaults
       test: ["CMD", "healthcheck.sh", "--su-mysql", "--connect", "--innodb_initialized"]
 
-  # S3 compatible object storage
-  minio:
-    image: minio/minio
-    restart: always
-    environment:
-      MINIO_ACCESS_KEY: TESTINGACCESSKEY
-      MINIO_SECRET_KEY: testingsecretkey
-    command: server /data
-    healthcheck:
-      <<: *healthcheck-defaults
-      test: timeout 5s mc ready local
-
   # S3 -> filesystem gateway
   versitygw:
+    user: "1000:1000"
     image: versity/versitygw
     restart: always
    environment:
       ROOT_ACCESS_KEY: TESTINGACCESSKEY
       ROOT_SECRET_KEY: testingsecretkey
     volumes:
-      - ./var/vgw:/vgw
-    command: posix /vgw
+      - ./var/vgw:/usr/local/feed/var/vgw
+    command: --health /health posix /usr/local/feed/var/vgw
+    healthcheck:
+      <<: *healthcheck-defaults
+      test: [ "CMD", "wget", "--quiet", "--tries=1", "-O", "/dev/null", "http://127.0.0.1:7070/health" ]
 
   pushgateway:
     image: prom/pushgateway
diff --git a/etc/config_test.yml b/etc/config_test.yml
index ac060ff5..56faa46d 100644
--- a/etc/config_test.yml
+++ b/etc/config_test.yml
@@ -25,7 +25,7 @@ emma:
   namespace: test
   packagetype: emma
   bucket: emma-test-bucket
-  awscli: ['aws', '--endpoint-url', 'http://minio:9000']
+  awscli: ['aws', '--endpoint-url', 'http://versitygw:7070']
 
 rabbitmq:
   host: rabbitmq
@@ -34,11 +34,10 @@ rabbitmq:
   queue: testqueue
   priority_levels: 3
 
-test_awscli: ['aws', '--endpoint-url', 'http://minio:9000']
+awscli: ['aws', '--endpoint-url', 'http://versitygw:7070']
 
 pushgateway: http://pushgateway:9091
 
-
 # To configure in production
 handle:
diff --git a/t/collate.t b/t/collate.t
index 606060d7..756dffb1 100644
--- a/t/collate.t
+++ b/t/collate.t
@@ -173,19 +173,8 @@ describe "HTFeed::Collate" => sub {
     };
 
     context "with PairtreeObjectStore" => sub {
-      my $s3;
-      my $bucket;
-
-      before all => sub {
-        $bucket = "bucket" . sprintf("%08d",rand(1000000));
-        $s3 = HTFeed::Storage::S3->new(
-          bucket => $bucket,
-          awscli => get_config('versitygw_awscli')
-        );
-        $ENV{AWS_MAX_ATTEMPTS} = 1;
-
-        $s3->mb;
-      };
+      spec_helper 's3_helper.pl';
+      local our ($s3, $bucket);
 
       my $old_storage_classes;
 
diff --git a/t/s3_helper.pl b/t/s3_helper.pl
index 490cf4b0..63edef93 100644
--- a/t/s3_helper.pl
+++ b/t/s3_helper.pl
@@ -6,7 +6,7 @@
 $bucket = "bucket" . sprintf("%08d",rand(1000000));
 $s3 = HTFeed::Storage::S3->new(
   bucket => $bucket,
-  awscli => get_config('minio_awscli')
+  awscli => get_config('awscli')
 );
 $ENV{AWS_MAX_ATTEMPTS} = 1;
 
diff --git a/t/storage_pairtree_object_store.t b/t/storage_pairtree_object_store.t
index 2394967c..e8579672 100644
--- a/t/storage_pairtree_object_store.t
+++ b/t/storage_pairtree_object_store.t
@@ -1,28 +1,32 @@
 use HTFeed::Config qw(get_config);
 use Test::Spec;
 use Test::Exception;
+use File::Temp qw(tempdir);
+use File::Basename qw(basename);
+use File::Path qw(make_path remove_tree);
 use HTFeed::Storage::PairtreeObjectStore;
 
 use strict;
 
 describe "HTFeed::Storage::PairtreeObjectStore" => sub {
   spec_helper 'storage_helper.pl';
-  local our ($tmpdirs, $testlog, $bucket, $s3);
+  spec_helper 's3_helper.pl';
 
-  before all => sub {
-    $bucket = "bucket" . sprintf("%08d",rand(1000000));
-    $s3 = HTFeed::Storage::S3->new(
-      bucket => $bucket,
-      awscli => get_config('versitygw_awscli')
-    );
-    $ENV{AWS_MAX_ATTEMPTS} = 1;
+  my $vgw_home = "$ENV{FEED_HOME}/var/vgw";
+  local our ($tmpdirs, $testlog, $bucket, $s3, $objdir, $bucket_dir);
 
-    $s3->mb;
+  before each => sub {
+    $s3->rm("/","--recursive");
+  };
+
+  before all => sub {
+    $bucket_dir = "$vgw_home/$bucket";
+    $objdir = "$vgw_home/$bucket-obj";
+    make_path($objdir);
   };
 
   after all => sub {
-    $s3->rm('/',"--recursive");
-    $s3->rb;
+    remove_tree($objdir,$bucket_dir);
   };
@@ -49,26 +53,45 @@ describe "HTFeed::Storage::PairtreeObjectStore" => sub {
   };
 
   describe "#move" => sub {
-    before each => sub {
-      $s3->rm("/","--recursive");
-    };
-
     it "uploads zip and mets" => sub {
       my $storage = object_storage('test','test');
+      my $pt_path = "test/pairtree_root/te/st/test";
       $storage->move;
 
-      ok($s3->s3_has("test/pairtree_root/te/st/test/test.zip"));
-      ok($s3->s3_has("test/pairtree_root/te/st/test/test.mets.xml"));
+      # should be in the bucket and also visible in the filesystem
+      ok($s3->s3_has("$pt_path/test.zip"));
+      ok($s3->s3_has("$pt_path/test.mets.xml"));
+      ok(-s "$bucket_dir/$pt_path/test.zip");
+      ok(-s "$bucket_dir/$pt_path/test.mets.xml");
     };
 
   };
 
   describe "#record_audit" => sub {
     it "records the item info in the feed_audit table";
-    it "does something with the sdr bucket";
   };
 
-  it "deals with old symlinks";
+  it "writes through existing symlinks" => sub {
+
+    my $pt_prefix = "test/pairtree_root/te/st";
+
+    # set things up using filesystem access rather than via s3
+    make_path("$objdir/$pt_prefix/test","$bucket_dir/$pt_prefix");
+    system("touch $objdir/$pt_prefix/test/test.zip");
+    system("touch $objdir/$pt_prefix/test/test.mets.xml");
+    system("ln -sv $objdir/$pt_prefix/test $bucket_dir/$pt_prefix/test");
+
+    # writes via the symlink in $bucket_dir
+    my $storage = object_storage('test','test');
+    $storage->move;
+
+    # started as zero size (via touch), should be nonzero size now
+    ok(-s "$objdir/$pt_prefix/test/test.zip");
+    ok(-s "$objdir/$pt_prefix/test/test.mets.xml");
+
+    # should still be a link in the bucket dir
+    ok(-l "$bucket_dir/$pt_prefix/test");
+  };
 
 };
diff --git a/var/vgw/.keep b/var/vgw/.keep
new file mode 100644
index 00000000..e69de29b

From 2f1cb52de97b627262b1d48ce53b662598041b69 Mon Sep 17 00:00:00 2001
From: Aaron Elkiss
Date: Fri, 26 Sep 2025 17:08:58 -0400
Subject: [PATCH 5/9] ETT-574: collate test simulating new storage

simulates deposit with:

* one LinkedPairtree (current storage)
* two different PairtreeObjectStores (new storage at both sites)
* one ObjectStore (glacier deep archive)
* one PrefixedVersions (data den)

removes tests for

* two linkedpairtrees (not going to do this)
* single PairtreeObjectStore (duplicative of storage_pairtree_object_store.t)
---
 t/collate.t | 137 +++++++++++++++++-----------------------------------
 1 file changed, 44 insertions(+), 93 deletions(-)

diff --git a/t/collate.t b/t/collate.t
index 756dffb1..a9c94e13 100644
--- a/t/collate.t
+++ b/t/collate.t
@@ -172,120 +172,65 @@ describe "HTFeed::Collate" => sub {
       ok($testlog->matches(qw(INFO.*already in repo)));
     };
 
-    context "with PairtreeObjectStore" => sub {
+    context "with multiple real storage classes" => sub {
       spec_helper 's3_helper.pl';
-      local our ($s3, $bucket);
+      local our ($bucket, $s3);
 
       my $old_storage_classes;
-
-      before each => sub {
-        $old_storage_classes = get_config('storage_classes');
-        my $new_storage_classes = {
-          'pairtree_object_Store' =>
-          {
-            class => 'HTFeed::Storage::PairtreeObjectStore',
-            bucket => $s3->{bucket},
-            awscli => $s3->{awscli},
-          }
-        };
-        set_config($new_storage_classes,'storage_classes');
+      my %s3s;
+
+      before all => sub {
+        foreach my $suffix (qw(ptobj1 ptobj2 backup)) {
+          $s3s{$suffix} = HTFeed::Storage::S3->new(
+            bucket => "$bucket-$suffix",
+            awscli => get_config('awscli')
+          );
+          $s3s{$suffix}->mb;
+        }
       };
 
-      after each => sub {
-        set_config($old_storage_classes,'storage_classes');
-      };
-
-      it "copies and records to pairtree path" => sub {
-        my $volume = stage_volume($tmpdirs,'test','test');
-        my $stage = HTFeed::Stage::Collate->new(volume => $volume);
-        $stage->run;
-
-        ok(-e "$ENV{FEED_HOME}/var/vgw/$bucket/test/pairtree_root/te/st/test/test.mets.xml",'copies mets to pairtree in s3');
-        ok(-e "$ENV{FEED_HOME}/var/vgw/$bucket/test/pairtree_root/te/st/test/test.zip",'copies zip to pairtree in s3');
-
-        ok($stage->succeeded);
+      after all => sub {
+        foreach my $s3 (values(%s3s)) {
+          $s3->rm('/',"--recursive");
+          $s3->rb;
+        }
       };
-    };
-
-    context "with multiple linked pairtrees" => sub {
-      my $old_storage_classes;
-
-      before each => sub {
-        $old_storage_classes = get_config('storage_classes');
-        my $new_storage_classes = {
-          'linkedpairtree-test1' =>
-          {
-            class => 'HTFeed::Storage::LinkedPairtree',
-            obj_dir => $tmpdirs->{obj_dir} . "/tree1",
-            link_dir => $tmpdirs->{link_dir} . "/tree1"
-          },
-          'linkedpairtree-test2' =>
-          {
-            class => 'HTFeed::Storage::LinkedPairtree',
-            obj_dir => $tmpdirs->{obj_dir} . "/tree2",
-            link_dir => $tmpdirs->{link_dir} . "/tree2"
-          },
-        };
-        set_config($new_storage_classes,'storage_classes');
-      };
-
-      after each => sub {
-        set_config($old_storage_classes,'storage_classes');
-      };
-
-      it "copies and records to all configured storages" => sub {
-        my $volume = stage_volume($tmpdirs,'test','test');
-        my $stage = HTFeed::Stage::Collate->new(volume => $volume);
-        $stage->run;
-
-        my $dbh = get_dbh();
-        my $audits = $dbh->selectall_arrayref("SELECT * from feed_audit WHERE namespace = 'test' and id = 'test'");
-
-        is(scalar(@{$audits}),1,'records an audit');
-
-        ok(-e "$tmpdirs->{obj_dir}/tree1/test/pairtree_root/te/st/test/test.mets.xml",'copies mets to local storage');
-        ok(-e "$tmpdirs->{obj_dir}/tree1/test/pairtree_root/te/st/test/test.zip",'copies zip to local storage');
-
-        ok(-e "$tmpdirs->{link_dir}/tree1/test/pairtree_root/te/st/test/test.mets.xml",'links mets to local storage');
-        ok(-e "$tmpdirs->{link_dir}/tree1/test/pairtree_root/te/st/test/test.zip",'links zip to local storage');
-
-
-        ok(-e "$tmpdirs->{obj_dir}/tree2/test/pairtree_root/te/st/test/test.mets.xml",'copies mets to local storage 2');
-        ok(-e "$tmpdirs->{obj_dir}/tree2/test/pairtree_root/te/st/test/test.zip",'copies zip to local storage 2');
-
-        ok(-e "$tmpdirs->{link_dir}/tree2/test/pairtree_root/te/st/test/test.mets.xml",'links mets to local storage 2');
-        ok(-e "$tmpdirs->{link_dir}/tree2/test/pairtree_root/te/st/test/test.zip",'links zip to local storage 2');
-
-        ok($stage->succeeded);
-      };
-    };
-
-    context "with multiple real storage classes" => sub {
-      spec_helper 's3_helper.pl';
-
-      local our ($bucket, $s3);
-      my $old_storage_classes;
 
       before each => sub {
         $old_storage_classes = get_config('storage_classes');
         my $new_storage_classes = {
+          # simulating isilon
           'linkedpairtree-test' => {
             class => 'HTFeed::Storage::LinkedPairtree',
             obj_dir => $tmpdirs->{obj_dir},
             link_dir => $tmpdirs->{link_dir}
           },
+          # simulating truenas (site 1)
+          'pairtreeobjectstore-ptobj1' => {
+            class => 'HTFeed::Storage::PairtreeObjectStore',
+            bucket => $s3s{ptobj1}->{bucket},
+            awscli => $s3s{ptobj1}->{awscli},
+          },
+          # simulating truenas (site 2)
+          'pairtreeobjectstore-ptobj2' => {
+            class => 'HTFeed::Storage::PairtreeObjectStore',
+            bucket => $s3s{ptobj2}->{bucket},
+            awscli => $s3s{ptobj2}->{awscli},
+          },
+          # simulating data den
           'prefixedversions-test' => {
             class => 'HTFeed::Storage::PrefixedVersions',
             obj_dir => $tmpdirs->{backup_obj_dir},
             encryption_key => $tmpdirs->test_home . "/fixtures/encryption_key"
           },
+          # simulating glacier deep archive
           'objectstore-test' => {
             class => 'HTFeed::Storage::ObjectStore',
-            bucket => $s3->{bucket},
-            awscli => $s3->{awscli},
+            bucket => $s3s{backup}->{bucket},
+            awscli => $s3s{backup}->{awscli},
             encryption_key => $tmpdirs->test_home . "/fixtures/encryption_key"
           }
         };
@@ -311,16 +256,22 @@ describe "HTFeed::Collate" => sub {
       is(scalar(@{$s3_backup}),1,'records a backup for object store');
 
       my $timestamp = $versioned_backup->[0][0];
-      ok(-e "$tmpdirs->{obj_dir}/test/pairtree_root/te/st/test/test.mets.xml",'copies mets to local storage');
-      ok(-e "$tmpdirs->{obj_dir}/test/pairtree_root/te/st/test/test.zip",'copies zip to local storage');
+
+      my $pt_path = "test/pairtree_root/te/st/test";
+      ok(-e "$tmpdirs->{obj_dir}/$pt_path/test.mets.xml",'copies mets to local storage');
+      ok(-e "$tmpdirs->{obj_dir}/$pt_path/test.zip",'copies zip to local storage');
 
       ok(-e "$tmpdirs->{backup_obj_dir}/test/tes/test.$timestamp.zip.gpg","copies the encrypted zip to backup storage");
       ok(-e "$tmpdirs->{backup_obj_dir}/test/tes/test.$timestamp.mets.xml","copies the mets backup storage");
 
       my $s3_timestamp = $s3_backup->[0][0];
-      ok($s3->s3_has("test.test.$s3_timestamp.zip.gpg"));
-      ok($s3->s3_has("test.test.$s3_timestamp.mets.xml"));
+      ok($s3s{ptobj1}->s3_has("$pt_path/test.mets.xml"));
+      ok($s3s{ptobj1}->s3_has("$pt_path/test.zip"));
+      ok($s3s{ptobj2}->s3_has("$pt_path/test.mets.xml"));
+      ok($s3s{ptobj2}->s3_has("$pt_path/test.zip"));
+      ok($s3s{backup}->s3_has("test.test.$s3_timestamp.zip.gpg"));
+      ok($s3s{backup}->s3_has("test.test.$s3_timestamp.mets.xml"));
 
       ok(! -e "$tmpdirs->{zip}/test/00000001.jp2","cleans up the extracted zip files");
-e "$tmpdirs->{zip}/test","cleans up the zip file tmpdir"); From 62489c2614578cd69c407a91b4511947062a2d8f Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Mon, 29 Sep 2025 14:43:49 -0400 Subject: [PATCH 6/9] ETT-574: Record deposit with PairtreeObjectStore in feed_storage * Avoid re-using feed_audit as it has extra info not related to the deposit (page count, etc) * Will want to think about retiring/migrating info from feed_audit when retiring old storage * Implement zip_size /mets_size for ObjectStore and PairtreeObjectStore --- etc/ingest.sql | 14 ++++++++++++ lib/HTFeed/Storage/ObjectStore.pm | 26 +++++++++++++++++------ lib/HTFeed/Storage/PairtreeObjectStore.pm | 25 ++++++++++++++++++++-- t/storage_pairtree_object_store.t | 15 ++++++++++++- 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/etc/ingest.sql b/etc/ingest.sql index 43825c9c..17e95127 100644 --- a/etc/ingest.sql +++ b/etc/ingest.sql @@ -127,6 +127,20 @@ CREATE TABLE IF NOT EXISTS `feed_backups` ( KEY `feed_backups_version` (`version`) ); +CREATE TABLE IF NOT EXISTS `feed_storage` ( + `namespace` varchar(10) NOT NULL, + `id` varchar(32) NOT NULL, + `storage_name` varchar(32) NOT NULL, + `zip_size` bigint(20) DEFAULT NULL, + `mets_size` bigint(20) DEFAULT NULL, + `saved_md5sum` char(32) DEFAULT NULL, + `deposit_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + `lastchecked` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + `lastmd5check` timestamp NULL DEFAULT NULL, + `md5check_ok` tinyint(1) DEFAULT NULL, + PRIMARY KEY (`namespace`, `id`, `storage_name`) +); + CREATE TABLE IF NOT EXISTS `feed_audit_detail` ( `namespace` varchar(10) NOT NULL, `id` varchar(30) NOT NULL, diff --git a/lib/HTFeed/Storage/ObjectStore.pm b/lib/HTFeed/Storage/ObjectStore.pm index 683f383c..d0d9e121 100644 --- a/lib/HTFeed/Storage/ObjectStore.pm +++ b/lib/HTFeed/Storage/ObjectStore.pm @@ -96,6 +96,16 @@ sub mets_key { return $self->object_path . ".mets.xml"; } +sub zip_size { + my $self = shift; + return $self->{filesize}{$self->zip_key}, +} + +sub mets_size { + my $self = shift; + return $self->{filesize}{$self->mets_key}, +} + sub zip_filename { my $self = shift; @@ -208,15 +218,19 @@ sub record_audit { $self->record_backup; } +sub saved_md5sum { + my $self = shift; + + my $b64_checksum = $self->{checksums}{$self->zip_key}; + my $hex_checksum = unpack("H*", decode_base64($b64_checksum)); +} + sub record_backup { my $self = shift; get_logger->trace(" starting record_backup"); my $dbh = HTFeed::DBTools::get_dbh(); - my $b64_checksum = $self->{checksums}{$self->zip_key}; - my $hex_checksum = unpack("H*", decode_base64($b64_checksum)); - my $stmt = join( " ", "INSERT INTO feed_backups", @@ -232,9 +246,9 @@ sub record_backup { $self->audit_path, $self->{timestamp}, $self->{name}, - $self->{filesize}{$self->zip_key}, - $self->{filesize}{$self->object_path . 
'.mets.xml'}, - $hex_checksum + $self->zip_size, + $self->mets_size, + $self->saved_md5sum ); get_logger->trace(" finished record_backup"); diff --git a/lib/HTFeed/Storage/PairtreeObjectStore.pm b/lib/HTFeed/Storage/PairtreeObjectStore.pm index 453c6c7e..2ca42426 100644 --- a/lib/HTFeed/Storage/PairtreeObjectStore.pm +++ b/lib/HTFeed/Storage/PairtreeObjectStore.pm @@ -5,6 +5,7 @@ package HTFeed::Storage::PairtreeObjectStore; use HTFeed::Storage::ObjectStore; use base qw(HTFeed::Storage::ObjectStore); +use HTFeed::DBTools qw(get_dbh); use File::Pairtree qw(id2ppath s2ppchars); sub object_path { @@ -32,8 +33,28 @@ sub mets_key { } sub record_audit { - # noop for now - maybe want to record info on the second site? - return 1; + my $self = shift; + + my $stmt = + "insert into feed_storage (namespace, id, storage_name, zip_size, mets_size, saved_md5sum, deposit_time, lastchecked, lastmd5check, md5check_ok) \ + values(?,?,?,?,?,?,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP,1) \ + ON DUPLICATE KEY UPDATE zip_size=?, mets_size=?, saved_md5sum=?, deposit_time=CURRENT_TIMESTAMP, lastchecked = CURRENT_TIMESTAMP,lastmd5check = CURRENT_TIMESTAMP, md5check_ok = 1"; + + my $storage_name = $self->{name}; + my $saved_md5sum = $self->saved_md5sum; + + my $zip_size = $self->zip_size; + my $mets_size = $self->mets_size; + + my $sth = get_dbh()->prepare($stmt); + my $res = $sth->execute( + $self->{namespace}, $self->{objid}, $storage_name, + $zipsize, $metssize, $saved_md5sum, + # duplicate parameters for duplicate key update + $zipsize, $metssize, $saved_md5sum + ); + + return $res; } 1; diff --git a/t/storage_pairtree_object_store.t b/t/storage_pairtree_object_store.t index e8579672..da344c28 100644 --- a/t/storage_pairtree_object_store.t +++ b/t/storage_pairtree_object_store.t @@ -68,7 +68,20 @@ describe "HTFeed::Storage::PairtreeObjectStore" => sub { }; describe "#record_audit" => sub { - it "records the item info in the feed_audit table"; + it "records the item info in the feed_storage table" => sub { + my $dbh = get_dbh(); + + my $storage = object_storage('test','test'); + $storage->stage; + $storage->make_object_path; + $storage->move; + $storage->record_audit; + + my $r = $dbh->selectall_arrayref("SELECT * from feed_storage WHERE namespace = 'test' and id = 'test' and storage_name='pairtreeobjectstore-test'"); + + ok($r->[0][0]); + + }; }; it "writes through existing symlinks" => sub { From 41e71161b67e99d92d843bde03374c73b0c0c044 Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Mon, 6 Oct 2025 13:26:48 -0400 Subject: [PATCH 7/9] Fix failing tests on Mac The Mac filesystem is case-preserving but not case sensitive, and t/lib/HTFeed/Namespace/Test.pm appears to conflict with lib/HTFeed/Namespace/TEST.pm. 
This appears to fix that issue.
---
 t/lib/HTFeed/Namespace/{Test.pm => ClassTest.pm} | 2 +-
 t/lib/HTFeed/Test/Class.pm                       | 4 +++-
 t/lib/HTFeed/Test/Support.pm                     | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)
 rename t/lib/HTFeed/Namespace/{Test.pm => ClassTest.pm} (77%)

diff --git a/t/lib/HTFeed/Namespace/Test.pm b/t/lib/HTFeed/Namespace/ClassTest.pm
similarity index 77%
rename from t/lib/HTFeed/Namespace/Test.pm
rename to t/lib/HTFeed/Namespace/ClassTest.pm
index 20c5c3f8..a06b876e 100644
--- a/t/lib/HTFeed/Namespace/Test.pm
+++ b/t/lib/HTFeed/Namespace/ClassTest.pm
@@ -1,4 +1,4 @@
-package HTFeed::Namespace::Test;
+package HTFeed::Namespace::ClassTest;
 
 use warnings;
 use strict;
diff --git a/t/lib/HTFeed/Test/Class.pm b/t/lib/HTFeed/Test/Class.pm
index d7318bb9..24aefb5c 100644
--- a/t/lib/HTFeed/Test/Class.pm
+++ b/t/lib/HTFeed/Test/Class.pm
@@ -7,10 +7,12 @@ use HTFeed::Config qw(get_config);
 use File::Path qw(remove_tree);
 
 # return testing class, with assumption that $class eq "$testing_class::Test"
+# or for example "$testing_class::SomethingTest"
+
 sub testing_class{
   my $self = shift;
   my $class = ref $self;
-  $class =~ s/::Test$//;
+  $class =~ s/::\w*Test$//;
   return $class;
 }
diff --git a/t/lib/HTFeed/Test/Support.pm b/t/lib/HTFeed/Test/Support.pm
index b38beba0..e3c7f34b 100644
--- a/t/lib/HTFeed/Test/Support.pm
+++ b/t/lib/HTFeed/Test/Support.pm
@@ -56,9 +56,10 @@ my @test_classes;
 my $libDir = "$FindBin::Bin/lib/";
 # get the path to each test classes
 find(sub{
-    if (-f and $_ =~ /^Test\.pm$/ ){
+    if (-f and $_ =~ /Test\.pm$/ ){
       my $name = $File::Find::name;
       $name =~ s/$libDir//;
+      return if $name =~ /AbstractTest\.pm$/;
       push @test_classes, $name;
     }
   }, $libDir

From d4ae2688122cb3a36dff51d293a0ca1b59a471b4 Mon Sep 17 00:00:00 2001
From: Aaron Elkiss
Date: Mon, 6 Oct 2025 14:59:38 -0400
Subject: [PATCH 8/9] Use /tmp/gnupg for the GnuPG home directory

I believe tests were failing because of issues regarding the socket files
mounted in from the host filesystem. We're already using /tmp/gnupg in the
feed-internal Dockerfile (which we can remove when we update to this version
of feed), so this shouldn't cause any issues.
---
 Dockerfile         | 7 +++++--
 docker-compose.yml | 5 +++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index a3d7c119..8fb2c6d0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -71,6 +71,11 @@ ENV PERL5LIB="/extlib/lib/perl5:$FEED_HOME/lib"
 COPY ./src/validateCache.cpp /usr/src/validateCache.cpp
 RUN /usr/bin/g++ -o /usr/local/bin/validate-cache /usr/src/validateCache.cpp -lxerces-c
 
+ENV GNUPGHOME=/tmp/gnupg
+RUN mkdir $GNUPGHOME
+RUN chown $UID:$GID $GNUPGHOME
+RUN chmod 700 $GNUPGHOME
+
 USER $UID:$GID
 
 WORKDIR $FEED_HOME
@@ -87,8 +92,6 @@ RUN mkdir -p /tmp/stage/grin
 RUN mkdir -p /tmp/prep/toingest /tmp/prep/failed /tmp/prep/ingested /tmp/prep/logs /tmp/prep/toingest/emma
 
 RUN mkdir $FEED_HOME/bin $FEED_HOME/src $FEED_HOME/.gnupg
-RUN chown $UID:$GID $FEED_HOME/.gnupg
-RUN chmod 700 $FEED_HOME/.gnupg
 
 COPY . $FEED_HOME
diff --git a/docker-compose.yml b/docker-compose.yml
index f147eee9..7b3ed668 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -130,7 +130,8 @@ services:
       ROOT_SECRET_KEY: testingsecretkey
     volumes:
       - ./var/vgw:/usr/local/feed/var/vgw
-    command: --health /health posix /usr/local/feed/var/vgw
+      - ./var/metadata_vgw:/usr/local/feed/var/metadata_vgw
+    command: --health /health posix --sidecar /usr/local/feed/var/metadata_vgw /usr/local/feed/var/vgw
     healthcheck:
       <<: *healthcheck-defaults
       test: [ "CMD", "wget", "--quiet", "--tries=1", "-O", "/dev/null", "http://127.0.0.1:7070/health" ]
@@ -156,4 +157,4 @@ volumes:
   repository_link:
   repository_obj:
   backups:
-  rclone:
+  vgw_sidecar:

From 7f01e397b6bafaccfe00e4f5983c44b84819ffd6 Mon Sep 17 00:00:00 2001
From: Aaron Elkiss
Date: Mon, 6 Oct 2025 15:12:50 -0400
Subject: [PATCH 9/9] Try to get versitygw tests working in both Mac & Github

---
 .github/workflows/tests.yml | 4 ++--
 var/metadata_vgw/.keep      | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 create mode 100644 var/metadata_vgw/.keep

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 5512952e..79647a17 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -18,8 +18,8 @@ jobs:
       # runner runs as userid 1001 but userid 1000 is baked into docker image.
       # we could adjust this if needed via env var but this should work
-      - name: Adjust permissions for versitygw directory
-        run: chmod 777 var/vgw
+      - name: Adjust permissions for versitygw directories
+        run: chmod 777 var/vgw var/metadata_vgw
 
diff --git a/var/metadata_vgw/.keep b/var/metadata_vgw/.keep
new file mode 100644
index 00000000..e69de29b
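
Usage sketch: the sequence below mirrors the calls exercised by the
#record_audit test in t/storage_pairtree_object_store.t, shown end to end.
It assumes the versitygw service from docker-compose.yml is running and the
bucket already exists; the storage name, bucket, and $volume are hypothetical
stand-ins, not values taken from the patches above.

    use HTFeed::Config qw(get_config);
    use HTFeed::Storage::PairtreeObjectStore;

    # Hypothetical config; a real $volume would come from staging a volume
    # (e.g. via stage_volume in the test helpers).
    my $storage = HTFeed::Storage::PairtreeObjectStore->new(
      name   => 'pairtreeobjectstore-example',
      volume => $volume,
      config => {
        bucket => 'example-bucket',
        awscli => get_config('awscli'),  # http://versitygw:7070 endpoint in config_test.yml
      },
    );

    $storage->stage;              # stage the zip and METS locally
    $storage->make_object_path;
    $storage->move;               # put <namespace>/pairtree_root/.../<objid>.zip and .mets.xml
    $storage->record_audit;       # upsert a row into feed_storage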