From c18eb944d98ef444baab0fd77f9bb6e3d6a70643 Mon Sep 17 00:00:00 2001 From: ldss-jm Date: Tue, 12 Feb 2019 08:05:59 -0500 Subject: [PATCH 1/2] Scope matches to desc_id/doc when desc_id passed. --- blake_superfast.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/blake_superfast.py b/blake_superfast.py index e46509a..3242902 100644 --- a/blake_superfast.py +++ b/blake_superfast.py @@ -1,4 +1,16 @@ #!/usr/bin/env python + +""" +Exports text fragments from matching documents from a Superfastmatch API. + +Standard usage: blake_superfast.py + +Scoped usage: blake_superfast.py [desc_id] + e.g.: blake_superfast.py jerusalem.e.illbk.85 +Looks for matches only between the desc_id document and (all) other documents. +""" + +import sys import csv import time import simplejson as json @@ -364,9 +376,21 @@ def same_matrix(self, doc, otherdoc): outfile = 'blake_superfast_matches.csv' matrix_relations_file = 'blake-relations.csv' print('Exporting matches/fragments to: ' + outfile) + + # If a command line argument is given, take it to be a desc_id, and + # look find matches only between that document/desc_id and other documents. + if len(sys.argv) > 2: + raise ValueError("Too many arguments passed from the command line.") + elif len(sys.argv) == 2: + desc_id = sys.argv[1] + print('Finding matches only for documents with desc_id: ' + desc_id) + iterator = [doc for doc in API.documents() if doc.desc_id == desc_id] + else: + iterator = API.documents() + try: API.export_fragments( - outfile, matrix_csv_path=matrix_relations_file + outfile, iterator=iterator, matrix_csv_path=matrix_relations_file ) except FileNotFoundError: print('Exclude/matrix_relations file not found. Not excluding matches ' From be731280ff69630444a7c7849400dc9166062dd2 Mon Sep 17 00:00:00 2001 From: ldss-jm Date: Fri, 15 Feb 2019 08:06:58 -0500 Subject: [PATCH 2/2] Scope matches to doc(s) that begin with passed desc_id/fragments --- blake_superfast.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/blake_superfast.py b/blake_superfast.py index 3242902..c885f8f 100644 --- a/blake_superfast.py +++ b/blake_superfast.py @@ -5,9 +5,16 @@ Standard usage: blake_superfast.py -Scoped usage: blake_superfast.py [desc_id] +Scoped usage: blake_superfast.py [desc_id_fragment] +Only considers matches if one of the documents begins with desc_id_fragment e.g.: blake_superfast.py jerusalem.e.illbk.85 -Looks for matches only between the desc_id document and (all) other documents. + Finds matches between jerusalem.e.illbk.85 and any other document + e.g.: blake_superfast.py jerusalem.e + Find matches between any jerusalem.e document and any other document +Documents must begin with the given desc_id_fragment, not merely contain it. +So, `blake_superfast.py e.illbk` would not consider "jerusalem.e.illbk.*" a +matching document. + """ import sys @@ -377,14 +384,21 @@ def same_matrix(self, doc, otherdoc): matrix_relations_file = 'blake-relations.csv' print('Exporting matches/fragments to: ' + outfile) - # If a command line argument is given, take it to be a desc_id, and - # look find matches only between that document/desc_id and other documents. + # If a command line argument is given, take it to be a desc_id fragment, + # and find matches only between 1) documents that begin with that desc_id + # fragment and 2) other documents if len(sys.argv) > 2: raise ValueError("Too many arguments passed from the command line.") elif len(sys.argv) == 2: desc_id = sys.argv[1] - print('Finding matches only for documents with desc_id: ' + desc_id) - iterator = [doc for doc in API.documents() if doc.desc_id == desc_id] + print('Finding matches only for documents with ' + 'desc_id beginning: ' + desc_id) + iterator = [doc for doc in API.documents() + if doc.desc_id.startswith(desc_id)] + if not iterator: + print('No documents have desc_id beginning: ' + desc_id) + print('Exiting.') + sys.exit() else: iterator = API.documents()