
Commit 7ebf7cf

updated docs, minor renames
1 parent 6413d75 commit 7ebf7cf

File tree

8 files changed, +210 -202 lines changed

README.md

Lines changed: 11 additions & 3 deletions
@@ -7,6 +7,13 @@ Leverage the power of NLP Topic Modeling, Semantic Similarity and Network analys
 - Please leave a ⭐ to let me know it has been useful to you so that I can dedicate more of my time working on it.
 
 ## Install
+- Highly recommend to install in a conda environment
+```
+conda create -n stripnet python=3.8 jupyterlab -y
+conda activate stripnet
+```
+
+- Pip install this library
 ```
 pip install stripnet
 ```
@@ -47,7 +54,7 @@ stripnet.fit_transform(data['text'])
 - The plot is fully interactive too! Hovering over any bar shows the relevant information of the paper.
 
 ```
-stripnet.most_important()
+stripnet.most_important_docs()
 ```
 
 ![Most Important Text](https://github.com/stephenleo/stripnet/blob/main/images/centrality.png?raw=true "Most Important Papers")
@@ -63,5 +70,6 @@ STriP Net stands on the shoulder of giants and several prior work. The most nota
 # Buy me a coffee
 If this work helped you in any way, please consider the following way to give me feedback so I can spend more time on this project
 1. ⭐ this repository
-2. ❤️ [the Huggingface space ](https://huggingface.co/spaces/stephenleo/strip)
-3.[Buy me a Coffee!](https://www.buymeacoffee.com/stephenleo)
+2. ❤️ [the Huggingface space ](https://huggingface.co/spaces/stephenleo/strip) (Coming Jan 11 2022!)
+3. 👏 [the Medium post](https://stephen-leo.medium.com/) (Coming End Jan 2022!)
+4.[Buy me a Coffee!](https://www.buymeacoffee.com/stephenleo)
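
The README diff above tracks this commit's rename of `stripnet.most_important()` to `stripnet.most_important_docs()`. The commit renames the method outright, with no alias; the sketch below is purely hypothetical and illustrates how a library could keep the old name working across such a rename (the `STriP` body here is a stub, not the real implementation):

```python
import warnings

# Hypothetical sketch only: this shim is NOT part of the stripnet commit.
# It shows one way to absorb a rename like most_important -> most_important_docs.
class STriP:
    def most_important_docs(self, topn: int = 5):
        # Stub standing in for the real method, which surfaces the most
        # central documents in the STriP network.
        return [f"doc_{i}" for i in range(topn)]

    def most_important(self, topn: int = 5):
        # Deprecated alias so code written against the old name keeps running.
        warnings.warn(
            "most_important() was renamed to most_important_docs()",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.most_important_docs(topn)
```

Since no such alias ships in 0.0.5, callers of `stripnet.most_important()` must update to `stripnet.most_important_docs()` when upgrading.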

notebook/stripnet.html

Lines changed: 0 additions & 108 deletions
This file was deleted.
File renamed without changes.
Lines changed: 38 additions & 68 deletions
@@ -94,23 +94,23 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"2022-01-05 20:42:12 INFO: Load pretrained SentenceTransformer: allenai-specter\n",
-"2022-01-05 20:42:36 INFO: Use pytorch device: cuda\n",
-"2022-01-05 20:42:36 INFO: Missing data detected. Dropping them\n",
-"2022-01-05 20:42:36 INFO: ========== Step1: Calculating Embeddings ==========\n",
-"Batches: 100%|██████████| 3/3 [00:02<00:00, 1.17it/s]\n",
-"2022-01-05 20:42:41 INFO: ========== Step2: Topic modeling ==========\n",
-"2022-01-05 20:42:41 INFO: Initializing the topic model\n",
-"2022-01-05 20:42:41 INFO: Training the topic model\n",
-"2022-01-05 20:42:50,425 - BERTopic - Reduced dimensionality with UMAP\n",
-"2022-01-05 20:42:50,437 - BERTopic - Clustered UMAP embeddings with HDBSCAN\n",
-"2022-01-05 20:42:50 INFO: Populating Topic Results\n",
-"2022-01-05 20:42:50 INFO: ========== Step3: STriP Network ==========\n",
-"2022-01-05 20:42:50 INFO: Cosine similarity\n",
-"2022-01-05 20:42:50 INFO: Calculating optimal threshold\n",
-"2022-01-05 20:42:50 INFO: Number of connections: 126\n",
-"2022-01-05 20:42:50 INFO: Calculating Network Plot\n",
-"2022-01-05 20:42:50 INFO: ========== Model Fit Successfully! ==========\n"
+"2022-01-06 12:16:44 INFO: Load pretrained SentenceTransformer: allenai-specter\n",
+"2022-01-06 12:17:07 INFO: Use pytorch device: cuda\n",
+"2022-01-06 12:17:07 INFO: Missing data detected. Dropping them\n",
+"2022-01-06 12:17:07 INFO: ========== Step1: Calculating Embeddings ==========\n",
+"Batches: 100%|██████████| 3/3 [00:02<00:00, 1.11it/s]\n",
+"2022-01-06 12:17:12 INFO: ========== Step2: Topic modeling ==========\n",
+"2022-01-06 12:17:12 INFO: Initializing the topic model\n",
+"2022-01-06 12:17:12 INFO: Training the topic model\n",
+"2022-01-06 12:17:21,291 - BERTopic - Reduced dimensionality with UMAP\n",
+"2022-01-06 12:17:21,304 - BERTopic - Clustered UMAP embeddings with HDBSCAN\n",
+"2022-01-06 12:17:21 INFO: Populating Topic Results\n",
+"2022-01-06 12:17:21 INFO: ========== Step3: STriP Network ==========\n",
+"2022-01-06 12:17:21 INFO: Cosine similarity\n",
+"2022-01-06 12:17:21 INFO: Calculating optimal threshold\n",
+"2022-01-06 12:17:21 INFO: Number of connections: 126\n",
+"2022-01-06 12:17:21 INFO: Calculating Network Plot\n",
+"2022-01-06 12:17:21 INFO: ========== Model Fit Successfully! ==========\n"
 ]
 },
 {
@@ -139,7 +139,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"2022-01-05 20:07:00 INFO: Calculating Network Centrality\n"
+"2022-01-06 12:18:23 INFO: Calculating Network Centrality\n"
 ]
 },
 {
@@ -161,6 +161,9 @@
 [
 "Want To Reduce Labeling Cost? GPT-3 Can<br>Help<br><br>Data annotation is a time-consuming<br>and labor-intensive process for many NLP tasks.<br>Although there exist various methods to produce<br>pseudo data labels, they are often task-specific<br>and require a decent amount of labeled data to<br>start with. Recently, the immense language model<br>GPT-3 with 175 billion parameters has achieved<br>tremendous improvement across many few-shot<br>learning tasks. In this paper, we explore ways to<br>..."
 ],
+[
+"FNet: Mixing Tokens with Fourier<br>Transforms<br><br>We show that Transformer encoder<br>architec-tures can be massively sped up, with<br>limited accuracy costs, by replacing the self-<br>attention sublayers with simple linear<br>transformations that \"mix\" input tokens. These<br>linear transformations , along with simple<br>nonlinearities in feed-forward layers, are<br>sufficient to model semantic relationships in<br>several text classification tasks. Perhaps most<br>surprisingly, we find that ..."
+],
 [
 "Neural Machine Translation of Rare Words with<br>Subword Units<br><br>Neural machine translation<br>(NMT) models typically operate with a fixed<br>vocabulary , but translation is an open-vocabulary<br>problem. Previous work addresses the translation<br>of out-of-vocabulary words by backing off to a<br>dictionary. In this paper , we introduce a simpler<br>and more effective approach, making the NMT model<br>capable of open-vocabulary translation by encoding<br>rare and unknown words as sequences ..."
 ]
@@ -183,13 +186,15 @@
 0.10673251529415916,
 0.09934333324744282,
 0.07184737801176154,
+0.05168250442223043,
 0.042019416676950916
 ],
 "xaxis": "x",
 "y": [
 "5",
 "24",
 "59",
+"8",
 "7"
 ],
 "yaxis": "y"
@@ -200,9 +205,6 @@
 [
 "An Image is Worth 16x16 Words: Transformers for<br>Image Recognition at Scale<br><br>While the<br>Transformer architecture has become the de-facto<br>standard for natural language processing tasks,<br>its applications to computer vision remain<br>limited. In vision, attention is either applied in<br>conjunction with convolutional networks, or used<br>to replace certain components of convolutional<br>networks while keeping their overall structure in<br>place. We show that this reliance on CNNs is..."
 ],
-[
-"Unsupervised Data Augmentation for Consistency<br>Training<br><br>Semi-supervised learning lately<br>has shown much promise in improving deep learning<br>models when labeled data is scarce. Common among<br>recent approaches is the use of consistency<br>training on a large amount of unlabeled data to<br>constrain model predictions to be invariant to<br>input noise. In this work, we present a new<br>perspective on how to effectively noise unlabeled<br>examples and argue that the quality of noising..."
-],
 [
 "The 2021 Image Similarity Dataset and<br>Challenge<br><br>This paper introduces a new<br>benchmark for large-scale image similarity<br>detection. This benchmark is used for the Image<br>Similarity Challenge at NeurIPS'21 (ISC2021). The<br>goal is to determine whether a query image is a<br>modified copy of any image in a reference corpus<br>of size 1~million. The benchmark features a<br>variety of image transformations such as automated<br>transformations, hand-crafted image edits and<br>machine-..."
 ],
@@ -213,31 +215,29 @@
 "Learning Transferable Visual Models From Natural<br>Language Supervision<br><br>State-of-the-art<br>computer vision systems are trained to predict a<br>fixed set of predetermined object categories. This<br>restricted form of supervision limits their<br>generality and usability since additional labeled<br>data is needed to specify any other visual<br>concept. Learning directly from raw text about<br>images is a promising alternative which leverages<br>a much broader source of supervision. We<br>d..."
 ]
 ],
-"hovertemplate": "Topic_Name=image, vision, learning, visual<br>Betweenness Centrality=%{x}<br>index=%{y}<br>Text=%{customdata[0]}<extra></extra>",
-"legendgroup": "image, vision, learning, visual",
+"hovertemplate": "Topic_Name=image, learning, contrastive, vision<br>Betweenness Centrality=%{x}<br>index=%{y}<br>Text=%{customdata[0]}<extra></extra>",
+"legendgroup": "image, learning, contrastive, vision",
 "marker": {
 "color": "#EF553B",
 "pattern": {
 "shape": ""
 }
 },
-"name": "image, vision, learning, visual",
-"offsetgroup": "image, vision, learning, visual",
+"name": "image, learning, contrastive, vision",
+"offsetgroup": "image, learning, contrastive, vision",
 "orientation": "h",
 "showlegend": true,
 "textposition": "auto",
 "type": "bar",
 "x": [
 0.07375869019704637,
-0.06015364679748239,
 0.059237319511292116,
 0.053904344657769304,
 0.03623643212684308
 ],
 "xaxis": "x",
 "y": [
 "57",
-"18",
 "42",
 "1",
 "34"
@@ -248,29 +248,29 @@
 "alignmentgroup": "True",
 "customdata": [
 [
-"FNet: Mixing Tokens with Fourier<br>Transforms<br><br>We show that Transformer encoder<br>architec-tures can be massively sped up, with<br>limited accuracy costs, by replacing the self-<br>attention sublayers with simple linear<br>transformations that \"mix\" input tokens. These<br>linear transformations , along with simple<br>nonlinearities in feed-forward layers, are<br>sufficient to model semantic relationships in<br>several text classification tasks. Perhaps most<br>surprisingly, we find that ..."
+"Unsupervised Data Augmentation for Consistency<br>Training<br><br>Semi-supervised learning lately<br>has shown much promise in improving deep learning<br>models when labeled data is scarce. Common among<br>recent approaches is the use of consistency<br>training on a large amount of unlabeled data to<br>constrain model predictions to be invariant to<br>input noise. In this work, we present a new<br>perspective on how to effectively noise unlabeled<br>examples and argue that the quality of noising..."
 ]
 ],
-"hovertemplate": "Topic_Name=image, matching, similarity, copy<br>Betweenness Centrality=%{x}<br>index=%{y}<br>Text=%{customdata[0]}<extra></extra>",
-"legendgroup": "image, matching, similarity, copy",
+"hovertemplate": "Topic_Name=learning, image, titles, product<br>Betweenness Centrality=%{x}<br>index=%{y}<br>Text=%{customdata[0]}<extra></extra>",
+"legendgroup": "learning, image, titles, product",
 "marker": {
 "color": "#00cc96",
 "pattern": {
 "shape": ""
 }
 },
-"name": "image, matching, similarity, copy",
-"offsetgroup": "image, matching, similarity, copy",
+"name": "learning, image, titles, product",
+"offsetgroup": "learning, image, titles, product",
 "orientation": "h",
 "showlegend": true,
 "textposition": "auto",
 "type": "bar",
 "x": [
-0.05168250442223043
+0.06015364679748239
 ],
 "xaxis": "x",
 "y": [
-"8"
+"18"
 ],
 "yaxis": "y"
 }
@@ -1138,36 +1138,7 @@
 }
 ],
 "source": [
-"stripnet.most_important()"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 3,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"['bertopic==0.9.4',\n",
-" 'networkx==2.6.3',\n",
-" 'numpy==1.22.0',\n",
-" 'pandas==1.3.5',\n",
-" 'plotly==5.5.0',\n",
-" 'pyvis==0.1.9',\n",
-" 'scikit_learn==1.0.2',\n",
-" 'sentence_transformers==2.1.0',\n",
-" 'setuptools==58.0.4']"
-]
-},
-"execution_count": 3,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"import pathlib\n",
-"pathlib.Path(\"../requirements.txt\").read_text().splitlines()"
+"stripnet.most_important_docs()"
 ]
 },
 {
@@ -1183,7 +1154,7 @@
 "hash": "165d1ae889830a583229da7bcb4f0175182080283a5d782889056a279531f3b2"
 },
 "kernelspec": {
-"display_name": "Python 3.8.12 64-bit ('stripnet': conda)",
+"display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
@@ -1198,9 +1169,8 @@
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
 "version": "3.8.12"
-},
-"orig_nbformat": 4
+}
 },
 "nbformat": 4,
-"nbformat_minor": 2
+"nbformat_minor": 4
 }

notebooks/stripnet.html

Lines changed: 108 additions & 0 deletions
Large diffs are not rendered by default.

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@ pyvis==0.1.9
 scikit_learn==1.0.2
 sentence_transformers==2.1.0
 setuptools==58.0.4
+ipywidgets==7.6.5

setup.py

Lines changed: 3 additions & 2 deletions
@@ -12,11 +12,12 @@
 'plotly==5.5.0',
 'pyvis==0.1.9',
 'scikit_learn==1.0.2',
-'sentence_transformers==2.1.0']
+'sentence_transformers==2.1.0',
+'ipywidgets==7.6.5']
 
 setuptools.setup(
 name="stripnet",
-version="0.0.4",
+version="0.0.5",
 author="stephenleo",
 author_email="stephen.leo87@gmail.com",
 description="STriP Net: Semantic Similarity of Scientific Papers (S3P) Network",
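
Both `requirements.txt` and the `install_requires` list above use exact `name==version` pins (e.g. `ipywidgets==7.6.5`). A minimal sketch of splitting such pins into name/version pairs; the `parse_pin` helper is hypothetical, not part of the repo:

```python
def parse_pin(requirement: str) -> tuple[str, str]:
    """Split an exact 'name==version' pin into (name, version)."""
    name, _, version = requirement.partition("==")
    return name, version

# The two pins this commit touches
pins = ["sentence_transformers==2.1.0", "ipywidgets==7.6.5"]
parsed = dict(parse_pin(p) for p in pins)
# parsed holds {'sentence_transformers': '2.1.0', 'ipywidgets': '7.6.5'}
```

Exact pins like these make installs reproducible at the cost of blocking compatible upgrades, which is why the commit must bump both files together.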
