From 4d035d93212171bae88df1189c65d4f6c7399b42 Mon Sep 17 00:00:00 2001
From: anlausch <anne-lauscher@web.de>
Date: Mon, 17 Dec 2018 09:40:51 +0100
Subject: [PATCH] Update converter for handling missing resources

---
 scripts/converter/converter.js | 432 ++++++++++++++++++++++++++++++++-
 1 file changed, 428 insertions(+), 4 deletions(-)

diff --git a/scripts/converter/converter.js b/scripts/converter/converter.js
index 9e6f6f0..a1e85ae 100644
--- a/scripts/converter/converter.js
+++ b/scripts/converter/converter.js
@@ -10,6 +10,8 @@ const BibliographicResourceOld = require('./oldSchema/bibliographicResource.js')
 const fs = require('fs');
 const enums = require('./../../api/schema/enum.json');
 
+const assert = require('assert');
+
 const mongoose = require('mongoose');
 
 // TODO: Check on everything related to journal issues again
@@ -643,15 +645,18 @@ function convertAufsatzsammlung() {
 function typeAnalysis(path){
     var data_old = require(path);
     var groups = groupBy(data_old, "type");
-    console.log(groups.length);
     var overview = []
     for(var group of groups){
+        console.log(group[0].type) // progress output: the type of the group that is dumped next
         overview.push([group[0], group[1], group[4]]);
         var group_string = JSON.stringify(group, null, 2);
-        fs.writeFileSync("./scripts/converter/analysis/ty_" + group[0].type + ".txt", group_string);
+        if(group[0].type == "JOURNAL" || group[0].type == "MONOGRAPH"){ // these two dumps are the input files read by convertJOURNAL() and convertMONOGRAPH()
+            fs.writeFileSync("./scripts/converter/analysis_finding_missing/ty_missing_" + group[0].type + ".txt", group_string);
+        }
+        fs.writeFileSync("./scripts/converter/analysis_finding_missing/ty_" + group[0].type + ".txt", group_string); // every group is dumped, whatever its type
     }
     overview = JSON.stringify(overview, null, 2);
-    fs.writeFileSync("./scripts/converter/analysis/overview.txt", overview);
+    fs.writeFileSync("./scripts/converter/analysis_finding_missing/overview_2.txt", overview); // overview holds elements 0, 1 and 4 of each type group
 
 }
 
@@ -1815,6 +1820,422 @@ function convertNothing(){
 }
 
 
+function convertJOURNAL(){
+    // Converts the 315 old-schema records dumped to ty_missing_JOURNAL.txt and writes the de-duplicated result to ty_missing_JOURNAL_NEU.txt.
+    // Data notes: grouping by containerTitle gives 137 groups, in the first group we have only one with partOf filled and 4 are journals
+    // grouping by partOf gives 5 groups --> 249 without partOf, 2 singletons, 9 linked to acta sociologica and 55 american behavioral scientist
+
+    var articles = JSON.parse(fs.readFileSync('./scripts/converter/analysis_finding_missing/ty_missing_JOURNAL.txt'));
+    var groups = groupBy(articles, "containerTitle");
+    var groupsPartOf = groupBy(articles, "partOf");
+
+    var leftovers = [];
+
+    var journals = [];
+    var journalArticles = [];
+
+    for(var br of articles) {
+        if (!br.partOf && !br.containerTitle && !br.number) {
+            /**
+             * Case 1: neither partOf, containerTitle nor number -> the record is a journal itself
+             */
+            var journal = new BibliographicResource({
+                journal_identifiers: br.identifiers,
+                journal_title: br.title,
+                _id: br._id,
+                status: br.status, // Eventually we need to map this somehow
+                parts: br.parts,
+                type: enums.resourceType.journal,
+                cites: br.cites,
+                partOf: br.partOf,
+            });
+            journals.push([journal]) // NOTE(review): `journals` is never read again, so these records are missing from the output file -- confirm this is intended
+        } else if (br.containerTitle && !br.partOf) {
+            /**
+             * Case 2: containerTitle but no partOf -> unlinked journal article; a new journal issue becomes its parent
+             */
+            var childType = enums.resourceType.journalArticle;
+            var parentType = enums.resourceType.journalIssue;
+
+            var volume = br.number.split("(")[0]; // volume: everything before the first "("
+            var issue = br.number.split("(")[1] ? br.number.split("(")[1].split(")")[0] : ""; // issue: text between the first "(" and the next ")"
+            var firstPage = br.number.split("pp. ")[1] ? br.number.split("pp. ")[1].split("-")[0] : ""; // pages: "<first>-<last>" after "pp. "
+            var lastPage = br.number.split("pp. ")[1] ? br.number.split("pp. ")[1].split("-")[1] : "";
+
+            if(br.number.split("Autumn")[1] && firstPage == "" && lastPage == ""){ // "Autumn" layout without "pp. ": re-parse, pages follow "S. "
+                volume = br.number.split("),")[1].split(" (")[0] ? br.number.split("),")[1].split(" (")[0] : "";
+                issue = br.number.split(") ")[1].split(", ")[0] ? br.number.split(") ")[1].split(", ")[0] : "";
+                firstPage = br.number.split("S. ")[1] ? br.number.split("S. ")[1].split("-")[0] : "";
+                lastPage = br.number.split("S. ")[1] ? br.number.split("S. ")[1].split("-")[1] : "";
+            }
+
+            var status = br.status;
+
+            var child = new BibliographicResource({
+                _id: br._id,
+                status: status, // Eventually we need to map this somehow
+                parts: br.parts,
+                type: childType,
+                cites: br.cites,
+                partOf: br.partOf
+            });
+
+            child.setTitleForType(childType, br.title);
+            child.setSubtitleForType(childType, br.subtitle);
+            child.setEditionForType(childType, br.edition);
+            child.setContributorsForType(childType, br.contributors);
+            child.setPublicationDateForType(childType, br.publicationYear);
+
+            if(firstPage != "" && lastPage != ""){
+                child.setResourceEmbodimentsForType(childType, [new ResourceEmbodiment({
+                    firstPage: firstPage,
+                    lastPage: lastPage
+                })]);
+            }else{
+                child.setResourceEmbodimentsForType(childType, [new ResourceEmbodiment(br.embodiedAs[0])]);
+            }
+
+            var parent = new BibliographicResource({type: parentType, status: status});
+            parent.setTitleForType(enums.resourceType.journal, br.containerTitle);
+
+            parent.journalIssue_number = issue;
+            parent.journalVolume_number = volume;
+
+            // now we split the identifiers between parent (journal issue) and child (article)
+            var parentIdentifiers = [];
+            var childIdentifiers = [];
+
+            for (var identifier of br.identifiers) {
+                // if there exists a series identifier, then the series should be the parent independent
+                // of the resource type
+                if (identifier.scheme == enums.identifier.issn) {
+                    parentIdentifiers.push(identifier);
+                    // if there exists a zdb identifier, then it should be the parent identifier
+                } else if (identifier.scheme == enums.identifier.zdbId) {
+                    parentIdentifiers.push(identifier);
+                } else if (identifier.scheme == enums.identifier.swbPpn || identifier.scheme == "PPN") {
+                    identifier.scheme = enums.identifier.swbPpn;
+                    parentIdentifiers.push(identifier);
+                } else if (identifier.scheme == enums.identifier.isbn) {
+                    parentIdentifiers.push(identifier);
+                } else {
+                    // everything else should belong to the child
+                    childIdentifiers.push(identifier);
+                }
+            }
+            child.setIdentifiersForType(childType, childIdentifiers);
+            parent.setIdentifiersForType(parentType, parentIdentifiers);
+
+            parent._id = mongoose.Types.ObjectId().toString();
+            child.partOf = parent._id;
+            journalArticles.push([child, parent]);
+        }else if (!br.partOf && !br.containerTitle && br.number) {
+            /**
+             * Case 3: article is not linked and has no container title, but a number
+             * 3 resources, which just should be transformed to parent + child
+             */
+
+            childType = enums.resourceType.journalArticle;
+            parentType = enums.resourceType.journalIssue;
+            firstPage = ""; lastPage = ""; // function-scoped vars: clear the pages parsed for an earlier record, they are only re-assigned in the "S." layout
+            volume = br.number.split('(')[0] ? br.number.split('(')[0] : "";
+            if(br.number.indexOf('S.')>-1){
+                year = br.number.split('(')[1] && br.number.split('(')[1].split(')')[0] ? br.number.split('(')[1].split(')')[0] : "";
+                issue = br.number.split('(')[1] && br.number.split('(')[1].split('), ')[1].split(',')[0] ? br.number.split('(')[1].split('), ')[1].split(',')[0] : "";
+                firstPage = br.number.split('(')[1] && br.number.split('S. ')[1].split('-')[0] ? br.number.split('S. ')[1].split('-')[0] : "";
+                lastPage = br.number.split('(')[1] && br.number.split('S. ')[1].split('-')[1] ? br.number.split('S. ')[1].split('-')[1] : "";
+            }else{
+                issue = br.number.split('(')[1] ?  br.number.split('(')[1].split(')')[0] : "";
+            }
+
+            status = br.status;
+
+            child = new BibliographicResource({
+                _id: br._id,
+                status: status, // Eventually we need to map this somehow
+                parts: br.parts,
+                type: childType,
+                cites: br.cites,
+                partOf: br.partOf
+            });
+
+            child.setTitleForType(childType, br.title);
+            child.setSubtitleForType(childType, br.subtitle);
+            child.setEditionForType(childType, br.edition);
+            child.setContributorsForType(childType, br.contributors);
+            child.setPublicationDateForType(childType, br.publicationYear);
+            if(firstPage != "" && lastPage != ""){
+                child.setResourceEmbodimentsForType(childType, [new ResourceEmbodiment({
+                    firstPage: firstPage,
+                    lastPage: lastPage
+                })]);
+            }else{
+                child.setResourceEmbodimentsForType(childType, [new ResourceEmbodiment(br.embodiedAs[0])]);
+            }
+            parent = new BibliographicResource({type: parentType, status: status});
+            parent.setTitleForType(enums.resourceType.journal, br.containerTitle);
+            parent.journalIssue_number = issue;
+            parent.journalVolume_number = volume;
+
+            // now we split the identifiers between parent (journal issue) and child (article)
+            parentIdentifiers = [];
+            childIdentifiers = [];
+
+            for (identifier of br.identifiers) {
+                // if there exists a series identifier, then the series should be the parent independent
+                // of the resource type
+                if (identifier.scheme == enums.identifier.issn) {
+                    parentIdentifiers.push(identifier);
+                    // if there exists a zdb identifier, then it should be the parent identifier
+                } else if (identifier.scheme == enums.identifier.zdbId) {
+                    parentIdentifiers.push(identifier);
+                } else if (identifier.scheme == enums.identifier.swbPpn || identifier.scheme == "PPN") {
+                    identifier.scheme = enums.identifier.swbPpn;
+                    parentIdentifiers.push(identifier);
+                } else if (identifier.scheme == enums.identifier.isbn) {
+                    parentIdentifiers.push(identifier);
+                } else {
+                    // everything else should belong to the child
+                    childIdentifiers.push(identifier);
+                }
+            }
+            child.setIdentifiersForType(childType, childIdentifiers);
+            parent.setIdentifiersForType(parentType, parentIdentifiers);
+
+            parent._id = mongoose.Types.ObjectId().toString();
+            child.partOf = parent._id;
+
+            journalArticles.push([child, parent]);
+            //leftovers.push([child, parent]);
+        }else if((br.containerTitle && br.partOf) || (!br.containerTitle && br.partOf)){
+            /**
+             * Case 4: article is linked (partOf is set), with or without a container title
+             * (the condition above is true exactly when br.partOf is truthy)
+             */
+            // without a number the existing link is kept; with a number a new journal issue parent is created
+            childType = enums.resourceType.journalArticle;
+
+            status = br.status;
+
+            child = new BibliographicResource({
+                _id: br._id,
+                status: status, // Eventually we need to map this somehow
+                parts: br.parts,
+                type: childType,
+                cites: br.cites,
+                partOf: br.partOf
+            });
+
+            child.setTitleForType(childType, br.title);
+            child.setSubtitleForType(childType, br.subtitle);
+            child.setEditionForType(childType, br.edition);
+            //child.setNumberForType(type, this.number);
+
+            child.setContributorsForType(childType, br.contributors);
+            child.setPublicationDateForType(childType, br.publicationYear);
+
+
+            child.setResourceEmbodimentsForType(childType, [new ResourceEmbodiment(br.embodiedAs[0])]);
+
+            if(br.number){
+                // parsed layout: "... Vol. <volume>, No. <issue> (<year>), p. <first>-<last>"; any other layout throws a TypeError here
+                volume = br.number.split("Vol. ")[1].split(",")[0];
+                issue = br.number.split("Vol. ")[1].split(",")[1].split("No. ")[1].split(" (")[0];
+                var year = br.number.split("Vol. ")[1].split(",")[1].split("No. ")[1].split(" (")[1].split(")")[0];
+                firstPage = br.number.split("Vol. ")[1].split(",")[2].split(" p. ")[1].split("-")[0];
+                lastPage = br.number.split("Vol. ")[1].split(",")[2].split(" p. ")[1].split("-")[1]; // NOTE(review): the pages parsed here are not used in this branch, the embodiment was already set from br.embodiedAs[0]
+
+                parentType = enums.resourceType.journalIssue;
+                parent = new BibliographicResource({type: parentType, status: status});
+                parent.setTitleForType(enums.resourceType.journal, br.containerTitle);
+
+                parent.journalIssue_number = issue;
+                parent.journalVolume_number = volume;
+
+                // now we split the identifiers between parent (journal issue) and child (article)
+                parentIdentifiers = [];
+                childIdentifiers = [];
+
+                for (var identifier of br.identifiers) {
+                    // if there exists a series identifier, then the series should be the parent independent
+                    // of the resource type
+                    if (identifier.scheme == enums.identifier.issn) {
+                        parentIdentifiers.push(identifier);
+                        // if there exists a zdb identifier, then it should be the parent identifier
+                    } else if (identifier.scheme == enums.identifier.zdbId) {
+                        parentIdentifiers.push(identifier);
+                    } else if (identifier.scheme == enums.identifier.swbPpn || identifier.scheme == "PPN") {
+                        identifier.scheme = enums.identifier.swbPpn;
+                        parentIdentifiers.push(identifier);
+                    } else if (identifier.scheme == enums.identifier.isbn) {
+                        parentIdentifiers.push(identifier);
+                    } else {
+                        // everything else should belong to the child
+                        childIdentifiers.push(identifier);
+                    }
+                }
+                parentType = enums.resourceType.journalIssue;
+                child.setIdentifiersForType(childType, childIdentifiers);
+                parent.setIdentifiersForType(parentType, parentIdentifiers);
+
+                parent._id = mongoose.Types.ObjectId().toString();
+                child.setPublicationDateForType(childType, year);
+                child.partOf = parent._id;
+                journalArticles.push([child, parent]);
+            }else{
+                child.setIdentifiersForType(childType, br.identifiers);
+                child.partOf = br.partOf;
+                journalArticles.push([child]);
+            }
+        }else{
+            // the four conditions above cover every combination of partOf/containerTitle/number,
+            // so nothing should ever end up here; the assertion below verifies that
+            leftovers.push(br)
+        }
+    }
+
+    assert(leftovers.length == 0)
+
+    // Now we need to remove the duplicates: every article above got its own freshly created parent issue
+    groups = groupBy(journalArticles.flatten(), "journal_title");
+
+    var articles_cleaned = [];
+    var duplicate_group = [];
+    for(var g of groups){
+        console.log(g[0].journal_title);
+        var volumeGroups = groupBy(g, "journalVolume_number");
+        for(var vg of volumeGroups){
+            var issueGroups = groupBy(vg, "journalIssue_number");
+            for(var ig of issueGroups){
+                // TODO: If it is of type journal and the identifiers are the same, then anyways we want to remove the duplicates
+                if(ig[0].journal_title && ig[0].type == enums.resourceType.journalIssue && ig.length > 1){
+                    // same title, volume and issue more than once: collect the ids of all these duplicates
+                    var corruptIdentifiers = [];
+                    for(var br of ig){
+                        corruptIdentifiers.push(br._id);
+                    }
+                    duplicate_group.push(corruptIdentifiers);
+                }
+            }
+        }
+
+    }
+    // keep the first id of every duplicate group, re-point all partOfs to it and drop the other entries
+    for(var br of journalArticles.flatten()){
+        for(var corruptIdentifiers of duplicate_group){
+            if(br.partOf && corruptIdentifiers.indexOf(br.partOf)> -1 && br.partOf != corruptIdentifiers[0]){
+                br.partOf = corruptIdentifiers[0];
+                break;
+            }
+
+        }
+    }
+    for(var br of journalArticles.flatten()){
+        var is_duplicate = false;
+        for(var corruptIdentifiers of duplicate_group){
+            if(br._id != corruptIdentifiers[0] && corruptIdentifiers.indexOf(br._id)> -1){
+                is_duplicate = true;
+                break;
+            }
+        }
+        if(!is_duplicate){
+            articles_cleaned.push(br);
+        }
+    }
+
+    console.log(articles_cleaned.flatten().length);
+    articles_cleaned = JSON.stringify(articles_cleaned, null, 2);
+    fs.writeFileSync("./scripts/converter/analysis_finding_missing/ty_missing_JOURNAL_NEU.txt", articles_cleaned);
+}
+
+function convertMONOGRAPH(){
+    /**
+     * Converts the old-schema records that typeAnalysis dumped to ty_missing_MONOGRAPH.txt
+     * and writes the result to ty_missing_MONOGRAPH_NEU.txt.
+     *
+     * The type is 'Monograph' for each of them. Grouped by container title, most records
+     * have none; only one monograph in the last group is problematic in that respect.
+     * partOf does not cause any trouble here.
+     * The group indices used below (0, 1 and 2) are tied to that particular dump.
+     * Every output entry is an array: [monograph] or [bookChapter, editedBook].
+     */
+    const monographs = JSON.parse(fs.readFileSync('./scripts/converter/analysis_finding_missing/ty_missing_MONOGRAPH.txt'));
+    const groups = groupBy(monographs, "containerTitle");
+    const groupsPartOf = groupBy(monographs, "partOf");
+    const converted = [];
+
+    // the first two container-title groups become stand-alone monographs
+    groups[0] = groups[0].concat(groups[1]);
+    for(const old of groups[0]){
+        const monographType = enums.resourceType.monograph;
+        const monograph = new BibliographicResource({
+            _id: old._id,
+            status: old.status, // Eventually we need to map this somehow
+            parts: old.parts,
+            type: monographType,
+            cites: old.cites
+        });
+
+        monograph.setTitleForType(monographType, old.title);
+        monograph.setSubtitleForType(monographType, old.subtitle);
+        monograph.setEditionForType(monographType, old.edition);
+        monograph.setContributorsForType(monographType, old.contributors);
+        monograph.setPublicationDateForType(monographType, old.publicationYear);
+        monograph.setIdentifiersForType(monographType, old.identifiers);
+
+        converted.push([monograph]);
+    }
+
+    // the third group is a book chapter in an edited book; pages and year are hard-coded
+    for(const old of groups[2]){
+        const chapterType = enums.resourceType.bookChapter;
+        const bookType = enums.resourceType.editedBook;
+        const pages = {firstPage: 13, lastPage: 31};
+        const publicationYear = 2006;
+
+        const chapter = new BibliographicResource({
+            _id: old._id,
+            status: old.status, // Eventually we need to map this somehow
+            parts: old.parts,
+            type: chapterType,
+            cites: old.cites
+        });
+
+        chapter.setTitleForType(chapterType, old.title);
+        chapter.setSubtitleForType(chapterType, old.subtitle);
+        chapter.setEditionForType(chapterType, old.edition);
+        chapter.setContributorsForType(chapterType, old.contributors);
+        chapter.setPublicationDateForType(chapterType, publicationYear);
+
+        const book = new BibliographicResource({type: bookType, status: old.status});
+        book.setTitleForType(bookType, old.containerTitle);
+        chapter.setResourceEmbodimentsForType(chapterType, [new ResourceEmbodiment(pages)]);
+
+        // the SWB PPN goes to the edited book, every other identifier stays with the chapter
+        const bookIdentifiers = [];
+        const chapterIdentifiers = [];
+        for(const identifier of old.identifiers){
+            const target = identifier.scheme == enums.identifier.swbPpn ? bookIdentifiers : chapterIdentifiers;
+            target.push(identifier);
+        }
+        chapter.setIdentifiersForType(chapterType, chapterIdentifiers);
+        book.setIdentifiersForType(bookType, bookIdentifiers);
+
+        // the book has no id of its own yet: generate one and link the chapter to it
+        book._id = mongoose.Types.ObjectId().toString();
+        chapter.partOf = book._id;
+        converted.push([chapter, book]);
+    }
+
+    // flatten() is not defined in this part of the file; presumably it flattens the
+    // nested entries so that the logged number counts single resources -- TODO confirm
+    console.log(converted.flatten().length);
+    const serialized = JSON.stringify(converted, null, 2);
+    fs.writeFileSync("./scripts/converter/analysis_finding_missing/ty_missing_MONOGRAPH_NEU.txt", serialized);
+}
+
+
+
+
 function convert(){
     convertCOLLECTION();
     convertAufsatzsammlung();
@@ -1902,7 +2323,8 @@ function analyzeResult(){
     fs.writeFileSync("./scripts/converter/analysis/all_NEU.txt", all);
 }
 
-analyzeResult()
+//analyzeResult()
+
 //convert();
 //typeAnalysis('./bibliographicResources.json');
 //convertCOLLECTION();
@@ -1919,5 +2341,7 @@ analyzeResult()
 //convertUndefined()
 //convertNothing()
 //convert('./bibliographicResources.json');
+//convertJOURNAL();
+convertMONOGRAPH();