From 1b46cde3c76cd1dc6943010de98ac6d172d97b1c Mon Sep 17 00:00:00 2001 From: Thomas Berezansky Date: Tue, 9 Aug 2016 11:55:03 -0400 Subject: [PATCH 1/3] Modify USMARC _next function The record terminator is also occasionally *in* records, so attempt to see if what follows it is another record or a continuation of the current one. In the latter case, keep reading the record. Signed-off-by: Thomas Berezansky --- marc-record/lib/MARC/File/USMARC.pm | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/marc-record/lib/MARC/File/USMARC.pm b/marc-record/lib/MARC/File/USMARC.pm index 5719814..7b6afd3 100644 --- a/marc-record/lib/MARC/File/USMARC.pm +++ b/marc-record/lib/MARC/File/USMARC.pm @@ -47,11 +47,26 @@ sub _next { my $self = shift; my $fh = $self->{fh}; - my $reclen; - return if eof($fh); - local $/ = END_OF_RECORD; - my $usmarc = <$fh>; + + # Return if we have no more file to read, and there is nothing in the buffer + return if eof($fh) and not defined($self->{buffer}); + + # Pull from the buffer or the first record in the file + my $usmarc = $self->{buffer} || <$fh>; + + # Add to $usmarc until the end of file or we see what looks like a new record + while (1) { + if (eof($fh)) { + delete($self->{buffer}); + last; + } else { + $self->{buffer} = <$fh>; + # This is an attempt at detecting a leader + last if ($self->{buffer} =~ /^[ \x00\x0a\x0d\x1a]*[0-9]{5}[acdnposx][acdefgijkmoprtz]...22/); + $usmarc = join($usmarc, END_OF_RECORD, $self->{buffer}); + } + }; # remove illegal garbage that sometimes occurs between records $usmarc =~ s/^[ \x00\x0a\x0d\x1a]+//; From 135d324d771dcb94dc8c64441658b2108cc1b0e7 Mon Sep 17 00:00:00 2001 From: Thomas Berezansky Date: Tue, 9 Aug 2016 12:37:54 -0400 Subject: [PATCH 2/3] End of record is already there, don't double it Signed-off-by: Thomas Berezansky --- marc-record/lib/MARC/File/USMARC.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marc-record/lib/MARC/File/USMARC.pm b/marc-record/lib/MARC/File/USMARC.pm index 7b6afd3..e7ee857 100644 --- a/marc-record/lib/MARC/File/USMARC.pm +++ b/marc-record/lib/MARC/File/USMARC.pm @@ -64,7 +64,7 @@ sub _next { $self->{buffer} = <$fh>; # This is an attempt at detecting a leader last if ($self->{buffer} =~ /^[ \x00\x0a\x0d\x1a]*[0-9]{5}[acdnposx][acdefgijkmoprtz]...22/); - $usmarc = join($usmarc, END_OF_RECORD, $self->{buffer}); + $usmarc .= $self->{buffer}; } }; From 94f9bd83a30588f69c57f18c5b720fc9ffe1633e Mon Sep 17 00:00:00 2001 From: Thomas Berezansky Date: Tue, 9 Aug 2016 12:44:57 -0400 Subject: [PATCH 3/3] Clean junk after the record Because we pull from after the record terminator we may pick up junk from the end of the file, so clean up any of that we run into. Signed-off-by: Thomas Berezansky --- marc-record/lib/MARC/File/USMARC.pm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/marc-record/lib/MARC/File/USMARC.pm b/marc-record/lib/MARC/File/USMARC.pm index e7ee857..4077ee1 100644 --- a/marc-record/lib/MARC/File/USMARC.pm +++ b/marc-record/lib/MARC/File/USMARC.pm @@ -71,6 +71,9 @@ sub _next { # remove illegal garbage that sometimes occurs between records $usmarc =~ s/^[ \x00\x0a\x0d\x1a]+//; + # In case we picked up some of that garbage, remove it from the *end* of the record too + $usmarc =~ s/\x1d[ \x00\x0a\x0d\x1a]+$/\x1d/; + return $usmarc; }