diff --git a/Vrindha K/VRINDHA K-Letter.docx b/Vrindha K/VRINDHA K-Letter.docx new file mode 100644 index 0000000..bf16621 Binary files /dev/null and b/Vrindha K/VRINDHA K-Letter.docx differ diff --git a/Vrindha K/VRINDHA_K (1).ipynb b/Vrindha K/VRINDHA_K (1).ipynb new file mode 100644 index 0000000..d00231e --- /dev/null +++ b/Vrindha K/VRINDHA_K (1).ipynb @@ -0,0 +1,3467 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "9d4bcf8c5a55439c84d5a0bde70c744f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_412b3fe20a164c5b81dc4d8aeffa8cf7", + "IPY_MODEL_7e135ef02cd24e9b8839b2b3b88b23a7", + "IPY_MODEL_2e9b5d12ff89430fbaef3967654094b6" + ], + "layout": "IPY_MODEL_dbdad554fa58487a81eed5d5fbe89a6e" + } + }, + "412b3fe20a164c5b81dc4d8aeffa8cf7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3cc1570fbbff4c89b795839c8d6c8999", + "placeholder": "​", + "style": "IPY_MODEL_f5cc938ab74f4e628ac905bbe7cb518b", + "value": "config.json: 100%" + } + }, + "7e135ef02cd24e9b8839b2b3b88b23a7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f5d7fc869be94ded884e2cfccbddce47", + "max": 473, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_73cd44b42b7c461e9b2fdce897584358", + "value": 473 + } + }, + "2e9b5d12ff89430fbaef3967654094b6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_83edd892a06f4047842caac589a75d2c", + "placeholder": "​", + "style": "IPY_MODEL_e3c288868e8a48d28b80d13907cfc893", + "value": " 473/473 [00:00<00:00, 25.7kB/s]" + } + }, + "dbdad554fa58487a81eed5d5fbe89a6e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3cc1570fbbff4c89b795839c8d6c8999": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f5cc938ab74f4e628ac905bbe7cb518b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f5d7fc869be94ded884e2cfccbddce47": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "73cd44b42b7c461e9b2fdce897584358": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "83edd892a06f4047842caac589a75d2c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e3c288868e8a48d28b80d13907cfc893": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6a7c4a58ed904b20bfe9cbe09b1c140c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_bb93223533e046eca711fa044bdf9e6a", + "IPY_MODEL_7b0255225d7b41978edddc5b5335880a", + "IPY_MODEL_79facb9c77744d9d82fbbcc58ddd64c1" + ], + "layout": "IPY_MODEL_64ad4db983a146d992e840604e3bb4ab" + } + }, + "bb93223533e046eca711fa044bdf9e6a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4244957b204c4bfb8a81c7ac30f04c93", + "placeholder": "​", + "style": "IPY_MODEL_c29652c828a44133a01ed3a6bf0ad5f6", + "value": "model.safetensors: 100%" + } + }, + "7b0255225d7b41978edddc5b5335880a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_664bb25395894a0cb8781bc88fe7a62b", + "max": 260782156, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_56f0d0cfa54f4777b86397c21367436a", + "value": 260782156 + } + }, + "79facb9c77744d9d82fbbcc58ddd64c1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_94d977008a9e45f6ab4f9d24c92d14bc", + "placeholder": "​", + "style": "IPY_MODEL_a76b4fc4363f462d8ad85076f6f3456b", + "value": " 261M/261M [00:03<00:00, 73.6MB/s]" + } + }, + "64ad4db983a146d992e840604e3bb4ab": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4244957b204c4bfb8a81c7ac30f04c93": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c29652c828a44133a01ed3a6bf0ad5f6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "664bb25395894a0cb8781bc88fe7a62b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "56f0d0cfa54f4777b86397c21367436a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "94d977008a9e45f6ab4f9d24c92d14bc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a76b4fc4363f462d8ad85076f6f3456b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "44c58296ffe84c9d8a0596addce8c33c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_452417200c604ce49d611e303bc475b4", + "IPY_MODEL_1c5e337de2be4074a7b49c98289fe3a2", + "IPY_MODEL_92e272277bc044258d620110c12335e5" + ], + "layout": "IPY_MODEL_e1896758b7d84641a53d274aad9a95b6" + } + }, + "452417200c604ce49d611e303bc475b4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_57e13a42c8ab44fda4dc5de24bada8e6", + "placeholder": "​", + "style": "IPY_MODEL_4235036ac2b04deebdc3c8bc595dc1c8", + "value": "tokenizer_config.json: 100%" + } + }, + "1c5e337de2be4074a7b49c98289fe3a2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4015b5266672449c9e4e4ab75d47cc7a", + "max": 29, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1ffafbfd71474c30bb5986519f90ced1", + "value": 29 + } + }, + "92e272277bc044258d620110c12335e5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b64e9e58c56a4784a8f0ecfd301c70b6", + "placeholder": "​", + "style": "IPY_MODEL_29f02020e6bc4ed5933af401fdfec0dc", + "value": " 29.0/29.0 [00:00<00:00, 1.71kB/s]" + } + }, + "e1896758b7d84641a53d274aad9a95b6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "57e13a42c8ab44fda4dc5de24bada8e6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4235036ac2b04deebdc3c8bc595dc1c8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4015b5266672449c9e4e4ab75d47cc7a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1ffafbfd71474c30bb5986519f90ced1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b64e9e58c56a4784a8f0ecfd301c70b6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "29f02020e6bc4ed5933af401fdfec0dc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fed76e90cabd4934855b17663234ee30": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c3e71ab34c004a098c27a74168694501", + "IPY_MODEL_21ff46e928a44376ae71a9ac4bd0cc29", + "IPY_MODEL_a819a12417374ef78ddfd5bd593489d6" + ], + "layout": "IPY_MODEL_2e6088cfea294357b384074bd891f51a" + } + }, + "c3e71ab34c004a098c27a74168694501": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3bfa05025686446dac3ca24b7a440020", + "placeholder": "​", + "style": "IPY_MODEL_23ad4f64c24048d190991f81557e928b", + "value": "vocab.txt: 100%" + } + }, + "21ff46e928a44376ae71a9ac4bd0cc29": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ec0baa60dbf2466b917ec5cfe924bba6", + "max": 213450, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_46a054dbaa0047dfac9ee1c83c80ec67", + "value": 213450 + } + }, + "a819a12417374ef78ddfd5bd593489d6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4cd73c4b70e34430ac65af8fc09cc612", + "placeholder": "​", + "style": "IPY_MODEL_3ce52afc52c14be28722a289d2c2e931", + "value": " 213k/213k [00:00<00:00, 536kB/s]" + } + }, + "2e6088cfea294357b384074bd891f51a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3bfa05025686446dac3ca24b7a440020": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "23ad4f64c24048d190991f81557e928b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ec0baa60dbf2466b917ec5cfe924bba6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "46a054dbaa0047dfac9ee1c83c80ec67": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4cd73c4b70e34430ac65af8fc09cc612": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3ce52afc52c14be28722a289d2c2e931": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ee9781a2879d404f834150670b67a185": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_55ce919f6c3842758dae89f3fbab7227", + "IPY_MODEL_71ca4e1427384c64ae1a3ad51817da74", + "IPY_MODEL_44bf5c35b17e4b9f8bb0783f55e4d044" + ], + "layout": "IPY_MODEL_0466058321ef4dbf92ebcff8fa5c1856" + } + }, + "55ce919f6c3842758dae89f3fbab7227": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dd4f516cbcdc4c7a83da69ceba0556eb", + "placeholder": "​", + "style": "IPY_MODEL_2729f08944764336b94ccecccfb7bd59", + "value": "tokenizer.json: 100%" + } + }, + "71ca4e1427384c64ae1a3ad51817da74": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2cba95e38ac44b0393755c69dedd8747", + "max": 435797, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_696197a54b4b45e2999a80da54c9de2b", + "value": 435797 + } + }, + "44bf5c35b17e4b9f8bb0783f55e4d044": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a15ecd477f414c9bbce20e54435b75a3", + "placeholder": "​", + "style": "IPY_MODEL_337c10a736a944cfa4f0453d6edca5b7", + "value": " 436k/436k [00:00<00:00, 733kB/s]" + } + }, + "0466058321ef4dbf92ebcff8fa5c1856": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dd4f516cbcdc4c7a83da69ceba0556eb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2729f08944764336b94ccecccfb7bd59": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2cba95e38ac44b0393755c69dedd8747": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "696197a54b4b45e2999a80da54c9de2b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a15ecd477f414c9bbce20e54435b75a3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "337c10a736a944cfa4f0453d6edca5b7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "rpkc8gITlSJu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install PyPDF2\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Dh9J6vuRBjuB", + "outputId": "785975c7-3e37-4b96-f2d2-d9e8ceccd3c8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting PyPDF2\n", + " Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: PyPDF2\n", + "Successfully installed PyPDF2-3.0.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from PyPDF2 import PdfReader\n", + "from google.colab import files\n", + "\n", + "def extract_metadata(pdf_path):\n", + " with open(pdf_path, 'rb') as f:\n", + " pdf = PdfReader(f)\n", + " info = pdf.metadata\n", + " title = info.get('/Title', None)\n", + " author = info.get('/Author', None)\n", + " year = info.get('/CreationDate', None)\n", + " if year:\n", + " year = int(year[2:6])\n", + "\n", + " return {'title': title, 'author': author, 'year': year}\n", + "\n", + "uploaded = files.upload()\n", + "\n", + "for filename in uploaded.keys():\n", + " metadata = extract_metadata(filename)\n", + " print(metadata)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 91 + }, + "id": "BA_AAIkyB7yw", + "outputId": "53def981-a9b1-44fd-8785-c57fa2778b13" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving s10502-022-09396-1 (1).pdf to s10502-022-09396-1 (1).pdf\n", + "{'title': 'Introduction: challenges and prospects of born-digital and digitized archives in the digital humanities', 'author': 'Lise Jaillant ', 'year': 2022}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from PyPDF2 import PdfReader\n", + "from google.colab import files\n", + "\n", + "def extract_text_from_pdf(pdf_path):\n", + " with open(pdf_path, 'rb') as f:\n", + " reader = PdfReader(f)\n", + " text = ''\n", + " for page_num in range(len(reader.pages)):\n", + " text += reader.pages[page_num].extract_text()\n", + " return text\n", + "\n", + "uploaded = files.upload()\n", + "\n", + "for filename in uploaded.keys():\n", + " pdf_text = extract_text_from_pdf(filename)\n", + " print(pdf_text)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "FoEA8vPWCbl9", + "outputId": "a76dc133-a298-4046-fc28-a24958b975e7" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving s10502-022-09396-1 (1).pdf to s10502-022-09396-1 (1) (1).pdf\n", + "Vol.:(0123456789)Archival Science (2022) 22:285–291\n", + "https://doi.org/10.1007/s10502-022-09396-1\n", + "1 3\n", + "EDITORIAL\n", + "Introduction: challenges and prospects of born‑digital \n", + "and digitized archives in the digital humanities\n", + "Lise Jaillant1  · Katie Aske2 · Eirini Goudarouli3 · Natasha Kitcher2\n", + "Published online: 26 May 2022 \n", + "© The Author(s), under exclusive licence to Springer Nature B.V. 2022, corrected publication 2022\n", + "The scale and complexity of digital archives, both born-digital and digitized, are \n", + "posing enormous challenges for both researchers and memory institutions. In the \n", + "world of archives, these new types of records are fundamentally changing the land-\n", + "scape as well as the role of archivists and archival institutions. The emergence of \n", + "new generation technologies also brings a variety of complexities and challenges \n", + "to archival frameworks, requiring new capabilities and approaches on how best to \n", + "capture, preserve, contextualize and present the increasingly born-digital and digi-\n", + "tized records. However, this technological shift also brings new opportunities for \n", + "research and experimentation (Goudarouli et al. 2019). For example, technological \n", + "developments have transformed the way researchers can access and explore archival \n", + "collections. The digitization of archival materials has opened a variety of large-scale \n", + "digital collections to the world. Additionally, born-digital archives are beginning to \n", + "reach terabytes, comprising many different types of media, that can be made acces-\n", + "sible online.\n", + "By enabling the extraction of archival content as data and moving towards the \n", + "creation of aggregated large-scale datasets, memory institutions are focusing on pro-\n", + "viding access to their collections in new ways inviting new explorations and interpre-\n", + "tations of their materials by researchers from across the world. In addition, today’s \n", + "reading rooms have been transformed to become more user-friendly than they were \n", + " * Lise Jaillant \n", + " l.jaillant@lboro.ac.uk\n", + " Katie Aske \n", + " K.L.Aske@lboro.ac.uk\n", + " Eirini Goudarouli \n", + " eirini.goudarouli@nationalarchives.gov.uk\n", + " Natasha Kitcher \n", + " N.Kitcher@lboro.ac.uk\n", + "1 School of Social Sciences and Humanities, Loughborough University, Epinal Way, \n", + "Loughborough LE11 3TU, UK\n", + "2 Loughborough University, Loughborough, UK\n", + "3 The National Archives UK, London, UK286 Archival Science (2022) 22:285–291\n", + "1 3\n", + "in the 1970s and 1980s, as online finding aids and search tools have replaced card \n", + "catalogues, making it easier to identify materials. Digital cameras allow users to \n", + "take their own pictures instead of relying on archival repositories for photocopying; \n", + "and laptops have enabled the exploration of online collections remotely and away \n", + "from the physical reading rooms. Twentieth-century historians, literary scholars and \n", + "other humanities researchers are still exploring old papers in archival collections, as \n", + "the collections that have been digitized only represent a small portion of all archival \n", + "holdings, but they are also increasingly focusing on exploring new types of born-\n", + "digital records such as web archives and social media archives.\n", + "However, too often, born-digital and digitized materials are inaccessible due to \n", + "copyright reasons. For instance, copyrighted texts are not available for download \n", + "from HathiTrust, a not-for-profit collaborative of libraries preserving 17 + million \n", + "digitized items (including c. 61% not in the public domain). Copyright reasons also \n", + "largely explain why web archives collected by major libraries (including the British \n", + "Library and the Bibliothèque Nationale de France) cannot be put online. To con-\n", + "sult archival webpages that were once publicly available, users often need to travel \n", + "to the repositories. Other types of archives born in digital forms, such as emails, \n", + "Word documents, digital pictures and video files, can also be difficult to access due \n", + "to copyright – but also privacy concerns, and technical issues. Emails have largely \n", + "replaced letters, and yet, researchers who need to consult archival emails will very \n", + "rarely be able to access these born-digital records.\n", + "This special issue explores the current challenges and prospects of born-digital \n", + "and digitized archives for the digital humanities, focusing particularly on the topic \n", + "of access. It brings together experts from archival science and the humanities, with \n", + "experts and practitioners from cultural heritage institutions. It is a key research out-\n", + "put of the AURA (Archives in the UK/ Republic of Ireland & AI) network, funded \n", + "by the Arts and Humanities Research Council (AHRC) in the UK and the Irish \n", + "Research Council.1 The AURA network was designed to unlock cultural assets \n", + "that are preserved in digital archives, closed to the public, or difficult to access. By \n", + "bringing together digital humanists, computer scientists, and stakeholders (including \n", + "policymakers), the network aimed to find solutions to the problem of inaccessible \n", + "records in digital archives. To explore the challenge of access to digital archives, \n", + "cross-disciplinary collaborations are absolutely essential.\n", + "The big challenges of our time, from global warming to social inequalities, can-\n", + "not be solved within a single discipline. The same applies to inaccessible archives: \n", + "we cannot expect archivists or digital humanists alone to find a magical solution \n", + "that will instantly make digital records more accessible. Instead, we need to set up \n", + "collaborations across disciplines that seldom talk to each other. Until recently, the \n", + "scholarship on digitized and born-digital records originated from the archive sector \n", + "and focused primarily on preservation. There were few examples of digital human-\n", + "ists who sat at the same table and took part in these discussions. In 2010, Matthew \n", + "Kirschenbaum, an American professor of Digital Humanities, co-authored a report \n", + "on ‘Digital Forensics and Born-Digital Content in Cultural Heritage Collections,’ \n", + "1 www. aura- netwo rk. net.287\n", + "1 3 Archival Science (2022) 22:285–291 \n", + "with professionals in libraries and archives (Kirschenbaum et al. 2010). This report \n", + "then led to a partnership that developed the BitCurator system2 now widely used \n", + "by digital archivists. Ten years later, Ryan Cordell published a report in partnership \n", + "with the Library of Congress on machine learning (ML) in the library sector. The \n", + "report mentioned that “access to data is the single greatest practical hurdle to more \n", + "Machine Learning work in libraries” (Cordell 2020, p 50).\n", + "The challenge of access is at the center of the new US/UK network AEOLIAN \n", + "(AI for Cultural Organizations),3 which complements the AURA network and its \n", + "research outcomes. This special issue focuses on problems of access to born-digital \n", + "and digitized archives in the digital humanities (DH), both from the infrastructural \n", + "and users’ perspectives. But accessing archival collections is not enough. The arti-\n", + "cles presented here also highlight the need to use innovative AI-based methodolo-\n", + "gies (such as Natural Language Processing or Linked Data) to support research. It \n", + "is also essential to develop partnerships between digital humanists, computer sci-\n", + "entists, and cultural heritage professionals to fully explore new ways to approach \n", + "digitized and born-digital archives.\n", + "Questions explored in this special issue include: How can we increase use by \n", + "digital humanities scholars of born-digital and digitized archives? How can we \n", + "give greater access to collections important to digital humanists that are currently \n", + "restricted? Collectively, the articles in the special issue problematize the challenges \n", + "and prospects of digital and born-digital archives. They offer new theoretical inter -\n", + "pretations, apply research methodologies to new case studies, and offer innovative \n", + "perspectives on present and future archival digital collections.\n", + "We invited contributions from interdisciplinary voices and received responses \n", + "from digital humanities scholars, emerging academics, trained librarians, and archi-\n", + "vists. While looking at a broad range of issues, from social media use to Python \n", + "notebooks, Handwritten Text Recognition to the semantic web, these contributions \n", + "examine similar themes as they seek to make digital archives accessible through a \n", + "range of new theoretical frameworks and practical tools. The initial contributions to \n", + "this special issue consider the subject of accessibility and other ethical challenges \n", + "faced by digital archives. The second section focuses on new computational tools for \n", + "archives.\n", + "Our first article, ‘Digital critical archives, copyright, and feminist praxis’ by \n", + "Claire Battershill et al. look at the challenge of creating equal archives. With a spe-\n", + "cial focus on twentieth-century publishing history, Battershill introduces the Mod-\n", + "ernist Archives Publishing Project (alongside Helena Clarkson, Matthew N. Hannah, \n", + "Ilya Nokhrin, Elizabeth Willson Gordon, and Nicola Wilson, all from the project \n", + "team). The article acknowledges that archives of different sizes face different chal-\n", + "lenges when trying to set up collections. Smaller archives face greater challenges, \n", + "as their digitization processes depend on what scholarly research gets funding, so \n", + "unequal practices can trickle-down and impact what is or is not available for users \n", + "to view. The Modernist Archives Publishing Project has emerged in this context, \n", + "2 https:// bitcu rator. net.\n", + "3 www. aeoli an- netwo rk. net.288 Archival Science (2022) 22:285–291\n", + "1 3\n", + "looking to make the selection process when digitizing objects clear to users. The \n", + "aim is to “create a digital archive that embodies feminist principles at all levels of \n", + "practice.” The first phase of the project ran from 2012 to 2020 and started by look -\n", + "ing at records associated with Hogarth Press, Leonard, and Virginia Woolf’s pub-\n", + "lishing house. The authors note that copyright was a considerable barrier to making \n", + "this collection available online since much of the material is from the twentieth cen-\n", + "tury. To overcome this challenge, it was possible to gain permission to digitize from \n", + "the legal owners, but this meant that physically digitizing the archives was just as \n", + "time-consuming as gaining clearance to do so.\n", + "In ‘Archives, linked data and the digital humanities: increasing access to digitized \n", + "and born digital archives via the Semantic Web’, Ashleigh Hawkins also consid-\n", + "ers the huge amount of work that needs to be done behind the scenes to digitize \n", + "archives, as well as the great value this process can have. The article is an introduc-\n", + "tion to linked data, looking at the benefits of, and current industry barriers to, archi-\n", + "val linked data, and the future directions for DH as data. This includes the possible \n", + "incorporation of AI, which has already started to take place, for example in the use \n", + "of the open-source tool ePADD for enabling access to email collections. Hawkins \n", + "notes that where digital data has been produced, it is not always readily available \n", + "because of issues such as intellectual property. For this data to be made machine-\n", + "readable and accessible, archivists need to be involved in the conversation. Hawkins \n", + "argues that while the infrastructure for archival linked data is emerging, archivists \n", + "need to participate in the production of these tools so that the metadata, content, and \n", + "other context are of a high enough standard for the semantic web. Hawkins is one \n", + "of many in this special issue who calls for an interdisciplinary approach to digital \n", + "archives.\n", + "Furthering the discussion of accessibility and the ethical challenges presented by \n", + "privacy issues is ‘A survey on email visualisation research to address the conflict \n", + "between privacy and access’ by Zoe Bartliff, Yunhyong Kim and Frank Hopfgartner. \n", + "This paper explores the perpetual cycle between email data access and concerns for \n", + "privacy and private information held within email archives. Where Bartliff et  al. \n", + "express the need to make emails more accessible data sources, they also address \n", + "the seemingly impossible balance between access and privacy—a need not met by \n", + "current visualisations for researching email data. The article proposes a categorisa-\n", + "tion of email visualisation attributes and a graded scale, as a means to identify the \n", + "extent to which privacy conscious data management can impact research on email \n", + "collections.\n", + "No matter what the archive is, or how big it is, granting users access in a way that \n", + "clearly shows how and why some records are available while others are not, is a key \n", + "challenge for digital archives. Sustainable ways to maintain and create archives need \n", + "to be identified, and articles in this special issue illuminate several ways this could \n", + "be done. A cross-disciplinary approach seems almost inevitable given the status of \n", + "the digital humanities as a tech-focused study of historical concern. Once the issue \n", + "of access to archives has been overcome, new computational tools can be developed \n", + "to either help explore these archives or complete the digitization themselves. Two \n", + "examples are presented in this special issue: Handwritten text recognition technol-\n", + "ogy (Transkribus), and Python Notebooks. Both tools come with their own unique 289\n", + "1 3 Archival Science (2022) 22:285–291 \n", + "set of prospects and challenges, showing that while the future of AI in archives may \n", + "be bright, there is still a need to scrutinize how and why we use digital archives \n", + "throughout the process.\n", + "Joseph Nockels et al. take a close look at Transkribus, a software that has been \n", + "used to speed up primary source transcription for digitization, in their article \n", + "‘Understanding the application of handwritten text recognition technology in herit-\n", + "age contexts: a systematic review of Transkribus in published research’. Transkri-\n", + "bus was originally funded as an EU Horizon 2020 project which launched as an \n", + "online tool in 2015 and is now a pay-to-use HTR technology. Nockels conducted \n", + "a study of all articles (not just peer-reviewed journals) that have used Transkribus \n", + "between 2015 and 2020. The authors found that Transkribus lends itself to a variety \n", + "of studies, and this is only getting more eclectic as time goes on (and the article flags \n", + "a recent branching out into botany). They also note that while the tool originated \n", + "in academia, its use may go well beyond that, and there is a need to monitor its \n", + "impact. This can be done with open forums and peer support to assist and continu-\n", + "ously remind users of the need for caution when using the tool. As with many digital \n", + "projects, handwritten text recognition in general and Tranksribus, in particular, have \n", + "benefitted from Covid-19, which increased a focus on digital projects in cultural her -\n", + "itage institutions. This paper is the first review of Transkribus in published research, \n", + "although if use of the technology continues to increase at the rate it has in the last \n", + "five years, it will probably not be the last.\n", + "Technology can make digitized sources more accessible, and it can also lead to \n", + "new knowledge–which is the topic of Leontien Talboom and Mark Bell’s ‘Keep-\n", + "ing it under lock and keywords: exploring new ways to open up the web archives \n", + "with Notebooks’. This article looks at how Python Notebooks can be used to take \n", + "users beyond the keyword search, with Notebooks serving as supporting tools that \n", + "aid search work while giving users a deeper look into their archive sources. The \n", + "article looks at two Notebooks used to search the UK Government Web Archive \n", + "(UKGWA). Notebook 1 focuses on available metadata, providing a view of records \n", + "not available through the UKGWA, while Notebook 2 takes the researcher to the \n", + "next step–crawling through UKGWA on behalf of the user and extracting relevant \n", + "content from pages to present an overview of what is available. Notebooks can be \n", + "used to showcase datasets for researchers, but the authors admit they may not be sus-\n", + "tainable long-term as their use depends on the availability of cloud platforms. The \n", + "authors encourage institutions to think about the ethical issues around computational \n", + "methods, and again emphasize the need to keep users involved in conversations \n", + "about the tools used to select and study historical data. Nonetheless, Notebooks are \n", + "(for now) a good tool to use when accessing archives.\n", + "The special issue closes with an article from Lise Jaillant, ‘How can we make \n", + "born-digital and digitised archives more accessible? Identifying obstacles and solu-\n", + "tions’. As a literary scholar and digital humanist, Jaillant addresses the key issues \n", + "faced by researchers when attempting to access “dark” digital collections. Based on \n", + "a series of interviews with archival and library professionals in the UK, Ireland, and \n", + "the US, the article explores the common obstacles that often hold back develop-\n", + "ments for improving the accessibility of digital archives. The article outlines current \n", + "levels of access to digital collections: some of them are completely closed to users, 290 Archival Science (2022) 22:285–291\n", + "1 3\n", + "while others are accessible on a limited basis (for example, when digital files are \n", + "available on-site but not remotely). Jaillant  suggests possible solutions to the prob-\n", + "lems of access–including the ethical use of Artificial Intelligence to unlock “dark” \n", + "archives inaccessible to users. She proposes the creation of a global user community \n", + "who would participate in decisions on access to digital collections. The articles in \n", + "this special issue all share a similar focus on the need for interdisciplinary research \n", + "in archives, the need to include users in conversations with practitioners and schol-\n", + "ars when creating new research tools, and the need to stay vigilant regarding pos-\n", + "sible ethical downfalls in digitization. The landscape of digital archives is changing \n", + "rapidly, as these articles show in both their scope and their focus on recent and new \n", + "technologies. What emerges from this special issue is the need for future-proof solu-\n", + "tions to the issues of born-digital and digitized archives, and to keep these discus-\n", + "sions going well into the future.\n", + "Funding This study is part of the AURA project (Archives in the UK/ Republic of Ireland and AI), which \n", + "received funding from the UK Arts and Humanities Research Council (reference AH/V002341/1) and the \n", + "Irish Research Council (reference IRC/V002341/1).\n", + "References\n", + "Cordell R (2020) Machine learning + libraries. Library of Congress, Washington, D.C. https:// labs. loc. \n", + "gov/ static/ labs/ work/ repor ts/ Corde ll- LOC- ML- report. pdf? loclr= blogs ig. Accessed 13 Aug. 2021.\n", + "Goudarouli E, Sexton A, Sheridan J (2019) The challenge of the digital and the future archive: through the \n", + "lens of the national archives UK. Phil Tech 32:173–183. https:// doi. org/ 10. 1007/ s13347- 018- 0333-3\n", + "Kirschenbaum M, Ovenden R, Redwine G (2010) Digital forensics and born-digital content in cultural \n", + "heritage collections. CLIR. https:// www. clir. org/ pubs/ repor ts/ pub149/. Accessed 13 Aug. 2021\n", + "Publisher’s Note Springer Nature remains neutral with regard to jurisdictional claims in published maps \n", + "and institutional affiliations.\n", + "Lise Jaillant is Senior Lecturer (Associate Professor) in Digital Humanities at Loughborough University. \n", + "She has a background in publishing history and digital humanities. In the past five years, she has gained \n", + "expertise on born-digital archives and the issues of preservation/ access to these archives. Since 2020, she \n", + "has been UK PI for three externally funded projects on Archives and Artificial Intelligence: (1) “EyCon \n", + "(Visual AI and Early Conflict Photography)” (2) “AEOLIAN: Artificial Intelligence for Cultural Organi-\n", + "sations” (3) “AURA (Archives in the UK/ Republic of Ireland & AI): Bringing together Digital Human-\n", + "ists, Computer Scientists & stakeholders to unlock cultural assets.” These international projects aim to \n", + "make digitised and born-digital archives more accessible to researchers, and to use innovative research \n", + "methods such as AI to analyse archival data.\n", + "Katie Aske is a Research Assistant at Loughborough University working on the AURA and AEOLIAN \n", + "networks. Aske is an award-winning scholar of eighteenth-century literature and the digital humanities. \n", + "After completing her PhD at Loughborough University in 2015, Aske undertook a Postdoctoral Research \n", + "Fellowship at Université de Bretagne Occidentale in 2016, working on the digital humanities project \n", + "‘DIGITENS: Digital Encyclopaedia of Eighteenth-Century British Sociability’. She has published widely \n", + "on eighteenth-century literature and medicine and has also worked on several major research projects at 291\n", + "1 3 Archival Science (2022) 22:285–291 \n", + "Northumbria University, including the AHRC-funded Sterne Digital Library.\n", + "Eirini Goudarouli is Head of Digital Research Programmes at The National Archives. Her current \n", + "research interests focus on digital research in cultural heritage. She is particularly interested in bringing \n", + "together methods and theories from a range of disciplines that could essentially contribute to the rethink -\n", + "ing of digital, archival and collection-based research.\n", + "Eirini has extensive experience working on interdisciplinary research projects across the Cultural \n", + "Heritage and Higher Education sectors. In 2015 she received a doctorate in History of Science from \n", + "the University of Athens, and she has been a visiting scholar at the University of Cambridge and \n", + "the University of Helsinki.\n", + "Before joining The National Archives, she was a researcher at the University of Warwick and an \n", + "Associate Research Fellow at Birkbeck, London. In previous years, she spent more than five years \n", + "working with the University collections belonging to the Historical Archive and the Lab for the \n", + "Electronic Processing of historical archives at the University of Athens.\n", + "Eirini is also a Research Fellow at the Research Centre for the Humanities, Greece, a member of \n", + "the Digital Committee at the Royal Historical Society, a member of ‘Humanities and Data Science’ \n", + "special interest group at the Alan Turing Institute, and a board member of the Advanced Information \n", + "Collaboratory, an international network with partners from leading academic and cultural institutions \n", + "spanning five continents.\n", + "Natasha Kitcher previously studied History at Royal Holloway, where she completed her BA in History \n", + "and an MA in Public History. As part of the course she learned a great deal about engaging wider audi-\n", + "ences in the past and wrote a play (Mum is MAD!) that was performed at Stanley Halls in 2019.\n", + " Now focused on her doctoral research, Natasha was recently the Programme Editor and an \n", + "Online Tutor for the University of London Worldwide. She is currently the Rapporteur for an AHRC \n", + "funded project with the Science Museum considering the culture and display of space exploration \n", + "for future space galleries.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from transformers import pipeline\n", + "from PyPDF2 import PdfReader\n", + "from google.colab import files\n", + "\n", + "def extract_text_from_pdf(pdf_path):\n", + " with open(pdf_path, 'rb') as f:\n", + " reader = PdfReader(f)\n", + " text = ''\n", + " for page_num in range(len(reader.pages)):\n", + " text += reader.pages[page_num].extract_text()\n", + " return text\n", + "\n", + "def extract_information(text):\n", + " nlp = pipeline(\"question-answering\")\n", + " result = nlp(question=\"What is the study about?\", context=text)\n", + " return result['answer']\n", + "\n", + "uploaded = files.upload()\n", + "\n", + "for filename in uploaded.keys():\n", + " pdf_text = extract_text_from_pdf(filename)\n", + " study_description = extract_information(pdf_text)\n", + " print(study_description)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 396, + "referenced_widgets": [ + "9d4bcf8c5a55439c84d5a0bde70c744f", + "412b3fe20a164c5b81dc4d8aeffa8cf7", + "7e135ef02cd24e9b8839b2b3b88b23a7", + "2e9b5d12ff89430fbaef3967654094b6", + "dbdad554fa58487a81eed5d5fbe89a6e", + "3cc1570fbbff4c89b795839c8d6c8999", + "f5cc938ab74f4e628ac905bbe7cb518b", + "f5d7fc869be94ded884e2cfccbddce47", + "73cd44b42b7c461e9b2fdce897584358", + "83edd892a06f4047842caac589a75d2c", + "e3c288868e8a48d28b80d13907cfc893", + "6a7c4a58ed904b20bfe9cbe09b1c140c", + "bb93223533e046eca711fa044bdf9e6a", + "7b0255225d7b41978edddc5b5335880a", + "79facb9c77744d9d82fbbcc58ddd64c1", + "64ad4db983a146d992e840604e3bb4ab", + "4244957b204c4bfb8a81c7ac30f04c93", + "c29652c828a44133a01ed3a6bf0ad5f6", + "664bb25395894a0cb8781bc88fe7a62b", + "56f0d0cfa54f4777b86397c21367436a", + "94d977008a9e45f6ab4f9d24c92d14bc", + "a76b4fc4363f462d8ad85076f6f3456b", + "44c58296ffe84c9d8a0596addce8c33c", + "452417200c604ce49d611e303bc475b4", + "1c5e337de2be4074a7b49c98289fe3a2", + "92e272277bc044258d620110c12335e5", + "e1896758b7d84641a53d274aad9a95b6", + "57e13a42c8ab44fda4dc5de24bada8e6", + "4235036ac2b04deebdc3c8bc595dc1c8", + "4015b5266672449c9e4e4ab75d47cc7a", + "1ffafbfd71474c30bb5986519f90ced1", + "b64e9e58c56a4784a8f0ecfd301c70b6", + "29f02020e6bc4ed5933af401fdfec0dc", + "fed76e90cabd4934855b17663234ee30", + "c3e71ab34c004a098c27a74168694501", + "21ff46e928a44376ae71a9ac4bd0cc29", + "a819a12417374ef78ddfd5bd593489d6", + "2e6088cfea294357b384074bd891f51a", + "3bfa05025686446dac3ca24b7a440020", + "23ad4f64c24048d190991f81557e928b", + "ec0baa60dbf2466b917ec5cfe924bba6", + "46a054dbaa0047dfac9ee1c83c80ec67", + "4cd73c4b70e34430ac65af8fc09cc612", + "3ce52afc52c14be28722a289d2c2e931", + "ee9781a2879d404f834150670b67a185", + "55ce919f6c3842758dae89f3fbab7227", + "71ca4e1427384c64ae1a3ad51817da74", + "44bf5c35b17e4b9f8bb0783f55e4d044", + "0466058321ef4dbf92ebcff8fa5c1856", + "dd4f516cbcdc4c7a83da69ceba0556eb", + "2729f08944764336b94ccecccfb7bd59", + "2cba95e38ac44b0393755c69dedd8747", + "696197a54b4b45e2999a80da54c9de2b", + "a15ecd477f414c9bbce20e54435b75a3", + "337c10a736a944cfa4f0453d6edca5b7" + ] + }, + "id": "MIV5PnpODM4e", + "outputId": "44a81a78-37ff-4b70-bb0c-b9200aad83ef" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving s10502-022-09396-1 (1).pdf to s10502-022-09396-1 (1) (2).pdf\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/473 [00:00" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving s10502-022-09396-1 (1).pdf to s10502-022-09396-1 (1) (3).pdf\n", + "Inserted article ID: 1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import sqlite3\n", + "\n", + "database = 'articles.db'\n", + "conn = sqlite3.connect(database)\n", + "\n", + "def show_table(conn, table_name):\n", + " cursor = conn.cursor()\n", + " cursor.execute(f\"SELECT * FROM {table_name}\")\n", + " rows = cursor.fetchall()\n", + " for row in rows:\n", + " print(row)\n", + "\n", + "show_table(conn, 'articles')\n", + "\n", + "conn.close()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KpCxaVBJE2fr", + "outputId": "56e55b75-8eaa-4e42-cea3-0f62e6f2186b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(1, 'Sample Title', 'John Doe', 2023)\n", + "(2, 'Introduction: challenges and prospects of born-digital and digitized archives in the digital humanities', 'Lise Jaillant ', 2022)\n", + "(3, 'Introduction: challenges and prospects of born-digital and digitized archives in the digital humanities', 'Lise Jaillant ', 2022)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import sqlite3\n", + "\n", + "# Function to create a database connection\n", + "def create_connection(db_file):\n", + " conn = None\n", + " try:\n", + " conn = sqlite3.connect(db_file)\n", + " return conn\n", + " except sqlite3.Error as e:\n", + " print(e)\n", + " return conn\n", + "\n", + "# Function to create a table in the database\n", + "def create_table(conn, create_table_sql):\n", + " try:\n", + " c = conn.cursor()\n", + " c.execute(create_table_sql)\n", + " except sqlite3.Error as e:\n", + " print(e)\n", + "\n", + "# Connect to the database\n", + "database = 'articles.db'\n", + "conn = create_connection(database)\n", + "\n", + "# Define the SQL statement to create the articles table\n", + "create_table_sql = \"\"\"\n", + "CREATE TABLE IF NOT EXISTS articles (\n", + " id INTEGER PRIMARY KEY,\n", + " title TEXT NOT NULL,\n", + " year INTEGER,\n", + " journal TEXT,\n", + " authors TEXT,\n", + " abstract TEXT,\n", + " introduction TEXT,\n", + " methodology TEXT,\n", + " results TEXT,\n", + " conclusions TEXT,\n", + " keywords TEXT\n", + ");\n", + "\"\"\"\n", + "\n", + "# Create the articles table\n", + "if conn is not None:\n", + " create_table(conn, create_table_sql)\n", + " print(\"Table created successfully.\")\n", + "else:\n", + " print(\"Error! Cannot create the database connection.\")\n", + "\n", + "# Close the database connection\n", + "conn.close()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "q-Enmn9HKvo3", + "outputId": "7eed29f6-f6b8-4524-983a-8bd41364f9c9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Table created successfully.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import sqlite3\n", + "\n", + "# Function to create a database connection\n", + "def create_connection(db_file):\n", + " conn = None\n", + " try:\n", + " conn = sqlite3.connect(db_file)\n", + " return conn\n", + " except sqlite3.Error as e:\n", + " print(e)\n", + " return conn\n", + "\n", + "# Function to create a table in the database\n", + "def create_table(conn, create_table_sql):\n", + " try:\n", + " c = conn.cursor()\n", + " c.execute(create_table_sql)\n", + " except sqlite3.Error as e:\n", + " print(e)\n", + "\n", + "# Function to display the schema of the database table\n", + "def display_table_schema(conn, table_name):\n", + " cursor = conn.cursor()\n", + " cursor.execute(f\"PRAGMA table_info({table_name})\")\n", + " rows = cursor.fetchall()\n", + " print(\"Table Schema:\")\n", + " for row in rows:\n", + " print(row)\n", + "\n", + "# Function to display the contents of the database table\n", + "def display_table_contents(conn, table_name):\n", + " cursor = conn.cursor()\n", + " cursor.execute(f\"SELECT * FROM {table_name}\")\n", + " rows = cursor.fetchall()\n", + " print(\"\\nTable Contents:\")\n", + " for row in rows:\n", + " print(row)\n", + "\n", + "# Connect to the database\n", + "database = 'articles.db'\n", + "conn = create_connection(database)\n", + "\n", + "# Define the SQL statement to create the articles table\n", + "create_table_sql = \"\"\"\n", + "CREATE TABLE IF NOT EXISTS articles (\n", + " id INTEGER PRIMARY KEY,\n", + " title TEXT NOT NULL,\n", + " year INTEGER,\n", + " journal TEXT,\n", + " authors TEXT,\n", + " abstract TEXT,\n", + " introduction TEXT,\n", + " methodology TEXT,\n", + " results TEXT,\n", + " conclusions TEXT,\n", + " keywords TEXT\n", + ");\n", + "\"\"\"\n", + "\n", + "# Create the articles table\n", + "if conn is not None:\n", + " create_table(conn, create_table_sql)\n", + " print(\"Table created successfully.\")\n", + "else:\n", + " print(\"Error! Cannot create the database connection.\")\n", + "\n", + "# Display the table schema\n", + "display_table_schema(conn, 'articles')\n", + "\n", + "# Close the database connection\n", + "conn.close()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FrFuApEmK5Zx", + "outputId": "1e64df11-b005-4907-ca9d-b507ea140463" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Table created successfully.\n", + "Table Schema:\n", + "(0, 'id', 'INTEGER', 0, None, 1)\n", + "(1, 'title', 'TEXT', 1, None, 0)\n", + "(2, 'author', 'TEXT', 1, None, 0)\n", + "(3, 'year', 'INTEGER', 0, None, 0)\n" + ] + } + ] + } + ] +} \ No newline at end of file