From 645cf8ee97ca87bf40df00518e3af3055474f644 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Fri, 19 Feb 2021 15:46:58 +0330 Subject: [PATCH 01/19] prettify README --- README.md | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index a503542..78a0d35 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,44 @@ -#LargeVis +# LargeVis + This is the *official* implementation of the **LargeVis** model by the original authors, which is used to visualize large-scale and high-dimensional data [(Tang, Liu, Zhang and Mei)](https://arxiv.org/abs/1602.00370). It now supports visualizing both high-dimensional feature vectors and networks. The package also contains a very efficient algorithm for constructing K-nearest neighbor graph (K-NNG). Contact person: Jian Tang, tangjianpku@gmail.com. This work is done when the author is in Microsoft Research Asia. -##Install +## Install + Both C++ source codes and Python wrapper are provided on Linux, OS X and Windows. To install the package, external packages are required, including [GSL (GNU Scientific Library)](http://www.gnu.org/software/gsl/) on Linux and OS X or [BOOST](http://www.boost.org/) on Windows for generating random numbers. -####Linux +#### Linux + Compile the source files via: -``` +```bash g++ LargeVis.cpp main.cpp -o LargeVis -lm -pthread -lgsl -lgslcblas -Ofast -march=native -ffast-math ``` To install the Python wrapper, modify ```setup.py``` to make sure that the GSL path is correctly set and then run ```sudo python setup.py install```. -####OS X +#### OS X + Install gsl using [Homebrew](http://brew.sh/): -``` +```bash brew install gsl ``` Modify line 347 of ```annoylib.h``` to change ```lseek64``` to ```lseek```. Then compile the source files (in the Linux folder) via: -``` +```bash g++ LargeVis.cpp main.cpp -o LargeVis -lm -pthread -lgsl -lgslcblas -Ofast -march=native -ffast-math -L/usr/local/lib -I/usr/local/include ``` To install the Python wrapper, run ```sudo python setup.py install```. -####Windows +#### Windows + To compile the source files, use Microsoft Visual Studio, where you need to set the BOOST path. To install the Python wrapper, modify ```setup.py``` to make sure that the BOOST path is correctly set and then run ```python setup.py install```. -##Usage +## Usage + LargeVis is suitable for visualizing both high-dimensional feature vectors and networks. For high-dimensional feature vectors, the format of input file should be as follows: the first line specifies the number of feature vectors and the dimensionality (500 vectors with 10 dimensions in the following example), and each of the next 500 lines describes one feature vector with 10 float numbers. ``` 500 10 @@ -55,11 +61,11 @@ For networks, each line of the input file is a DIRECTED edge. For each undirecte 495 498 1.5 ``` For C++ executable file, -``` +```bash ./LargeVis -input -output ``` or for Python, -``` +```bash python LargeVis_run.py -input -output ``` @@ -79,11 +85,12 @@ Besides the two parameters, other optional parameters include: * `-gamma`: The weights assigned to negative edges. Default is 7. * `-perp`: The perplexity used for deciding edge weights in K-NNG. Default is 50. -##Examples +## Examples + We provide some examples including MNIST(high-dimensional feature vectors) and CondMat(networks) in the ```Examples/``` folder. For example, to visualize the MNIST dataset, -``` +```bash python LargeVis_run.py -input mnist_vec784D.txt -output mnist_vec2D.txt -threads 16 python plot.py -input mnist_vec2D.txt -label mnist_label.txt -output mnist_vec2D_plot ``` @@ -91,8 +98,10 @@ python plot.py -input mnist_vec2D.txt -label mnist_label.txt -output mnist_vec2D ![plot of mnist](Examples/MNIST/mnist_plot.png) Please cite the following paper if you use LargeVis to visualize your data. -##Citation -``` + +## Citation + +```bibtex @inproceedings{tang2016visualizing, title={Visualizing Large-scale and High-dimensional Data}, author={Tang, Jian and Liu, Jingzhou and Zhang, Ming and Mei, Qiaozhu}, @@ -102,5 +111,7 @@ Please cite the following paper if you use LargeVis to visualize your data. organization={International World Wide Web Conferences Steering Committee} } ``` -##Acknowledgement + +## Acknowledgement + Some methods of this package are from a previous work of the LargeVis authors, [LINE (Large-scale Information Network Embedding)](https://github.com/tangjianpku/LINE). From 0902ff1cca530b6e3ba4353bc91d171ca2a0af5c Mon Sep 17 00:00:00 2001 From: "H@di" Date: Fri, 19 Feb 2021 15:57:25 +0330 Subject: [PATCH 02/19] add makefile --- Linux/irun.sh | 0 Linux/makefile | 14 ++++++++++++++ 2 files changed, 14 insertions(+) mode change 100644 => 100755 Linux/irun.sh create mode 100644 Linux/makefile diff --git a/Linux/irun.sh b/Linux/irun.sh old mode 100644 new mode 100755 diff --git a/Linux/makefile b/Linux/makefile new file mode 100644 index 0000000..43ffa40 --- /dev/null +++ b/Linux/makefile @@ -0,0 +1,14 @@ +all: LargeVis + +LargeVis: LargeVis.o main.o + g++ LargeVis.o main.o -o LargeVis -lm -pthread -lgsl -lgslcblas -Ofast -march=native -ffast-math + +LargeVis.o: LargeVis.cpp LargeVis.h ANNOY/* + g++ LargeVis.cpp -c -Ofast + +main.o: main.cpp LargeVis.h ANNOY/* + g++ main.cpp -c -Ofast + +.PHONY: clean +clean: + rm -f LargeVis *.o \ No newline at end of file From 6e2ed7b2ed76bdbf25cda684c53a79f108219783 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Fri, 19 Feb 2021 17:47:03 +0330 Subject: [PATCH 03/19] refactor python files --- LargeVis_run.py | 58 +++++++++++++++++---------------- plot.py | 87 ++++++++++++++++++++++++------------------------- 2 files changed, 73 insertions(+), 72 deletions(-) mode change 100644 => 100755 LargeVis_run.py mode change 100644 => 100755 plot.py diff --git a/LargeVis_run.py b/LargeVis_run.py old mode 100644 new mode 100755 index 4029e09..b85f6e1 --- a/LargeVis_run.py +++ b/LargeVis_run.py @@ -1,28 +1,30 @@ -import LargeVis -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument('-fea', default = 1, type = int, help = 'whether to visualize high-dimensional feature vectors or networks') -parser.add_argument('-input', default = '', help = 'input file') -parser.add_argument('-output', default = '', help = 'output file') -parser.add_argument('-outdim', default = -1, type = int, help = 'output dimensionality') -parser.add_argument('-threads', default = -1, type = int, help = 'number of training threads') -parser.add_argument('-samples', default = -1, type = int, help = 'number of training mini-batches') -parser.add_argument('-prop', default = -1, type = int, help = 'number of propagations') -parser.add_argument('-alpha', default = -1, type = float, help = 'learning rate') -parser.add_argument('-trees', default = -1, type = int, help = 'number of rp-trees') -parser.add_argument('-neg', default = -1, type = int, help = 'number of negative samples') -parser.add_argument('-neigh', default = -1, type = int, help = 'number of neighbors in the NN-graph') -parser.add_argument('-gamma', default = -1, type = float, help = 'weight assigned to negative edges') -parser.add_argument('-perp', default = -1, type = float, help = 'perplexity for the NN-grapn') - -args = parser.parse_args() - -if args.fea == 1: - LargeVis.loadfile(args.input) -else: - LargeVis.loadgraph(args.input) - -Y = LargeVis.run(args.outdim, args.threads, args.samples, args.prop, args.alpha, args.trees, args.neg, args.neigh, args.gamma, args.perp) - -LargeVis.save(args.output) +#!/usr/bin/env python + +import LargeVis +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('-fea', default=1, type=int, help='whether to visualize high-dimensional feature vectors or networks') +parser.add_argument('-input', default='', help='input file') +parser.add_argument('-output', default='', help='output file') +parser.add_argument('-outdim', default=-1, type=int, help='output dimensionality') +parser.add_argument('-threads', default=-1, type=int, help='number of training threads') +parser.add_argument('-samples', default=-1, type=int, help='number of training mini-batches') +parser.add_argument('-prop', default=-1, type=int, help='number of propagations') +parser.add_argument('-alpha', default=-1, type=float, help='learning rate') +parser.add_argument('-trees', default=-1, type=int, help='number of rp-trees') +parser.add_argument('-neg', default=-1, type=int, help='number of negative samples') +parser.add_argument('-neigh', default=-1, type=int, help='number of neighbors in the NN-graph') +parser.add_argument('-gamma', default=-1, type=float, help='weight assigned to negative edges') +parser.add_argument('-perp', default=-1, type=float, help='perplexity for the NN-grapn') + +args = parser.parse_args() + +if args.fea == 1: + LargeVis.loadfile(args.input) +else: + LargeVis.loadgraph(args.input) + +Y = LargeVis.run(args.outdim, args.threads, args.samples, args.prop, args.alpha, args.trees, args.neg, args.neigh, args.gamma, args.perp) + +LargeVis.save(args.output) diff --git a/plot.py b/plot.py old mode 100644 new mode 100755 index a55d6c8..8d1e916 --- a/plot.py +++ b/plot.py @@ -1,44 +1,43 @@ -import numpy -import matplotlib.pyplot as plt -import argparse - -parser = argparse.ArgumentParser() - -parser.add_argument('-input', default = '', help = 'input file') -parser.add_argument('-label', default = '', help = 'label file') -parser.add_argument('-output', default = '', help = 'output file') -parser.add_argument('-range', default = '', help = 'axis range') - -args = parser.parse_args() - -label = [] -if args.label != '': - for line in open(args.label): - label.append(line.strip()) - -N = M = 0 -all_data = {} -for i, line in enumerate(open(args.input)): - vec = line.strip().split(' ') - if i == 0: - N = int(vec[0]) - M = int(vec[1]) - elif i <= N: - if args.label == '': - label.append(0) - all_data.setdefault(label[i-1], []).append((float(vec[-2]), float(vec[-1]))) - -colors = plt.cm.rainbow(numpy.linspace(0, 1, len(all_data))) - -for color, ll in zip(colors, sorted(all_data.keys())): - x = [t[0] for t in all_data[ll]] - y = [t[1] for t in all_data[ll]] - plt.plot(x, y, '.', color = color, markersize = 1) -if args.range != '': - l = abs(float(args.range)) - plt.xlim(-l, l) - plt.ylim(-l, l) -plt.savefig(args.output, dpi = 500) - - - +#!/usr/bin/env python + +import numpy +import matplotlib.pyplot as plt +import argparse + +parser = argparse.ArgumentParser() + +parser.add_argument('-input', default='', help='input file') +parser.add_argument('-label', default='', help='label file') +parser.add_argument('-output', default='', help='output file') +parser.add_argument('-range', default='', help='axis range') + +args = parser.parse_args() + +label = [] +if args.label != '': + for line in open(args.label): + label.append(line.strip()) + +N = M = 0 +all_data = {} +for i, line in enumerate(open(args.input)): + vec = line.strip().split(' ') + if i == 0: + N = int(vec[0]) + M = int(vec[1]) + elif i <= N: + if args.label == '': + label.append(0) + all_data.setdefault(label[i-1], []).append((float(vec[-2]), float(vec[-1]))) + +colors = plt.cm.rainbow(numpy.linspace(0, 1, len(all_data))) + +for color, ll in zip(colors, sorted(all_data.keys())): + x = [t[0] for t in all_data[ll]] + y = [t[1] for t in all_data[ll]] + plt.plot(x, y, '.', color=color, markersize=1) +if args.range != '': + l = abs(float(args.range)) + plt.xlim(-l, l) + plt.ylim(-l, l) +plt.savefig(args.output, dpi=500) From d0af2c060d6c45c2f06c65f0e2346bdb0236bbab Mon Sep 17 00:00:00 2001 From: "H@di" Date: Fri, 19 Feb 2021 17:52:45 +0330 Subject: [PATCH 04/19] make some args required in python files --- LargeVis_run.py | 4 ++-- plot.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/LargeVis_run.py b/LargeVis_run.py index b85f6e1..8fca98f 100755 --- a/LargeVis_run.py +++ b/LargeVis_run.py @@ -5,8 +5,8 @@ parser = argparse.ArgumentParser() parser.add_argument('-fea', default=1, type=int, help='whether to visualize high-dimensional feature vectors or networks') -parser.add_argument('-input', default='', help='input file') -parser.add_argument('-output', default='', help='output file') +parser.add_argument('-input', default='', help='input file', required=True) +parser.add_argument('-output', default='', help='output file', required=True) parser.add_argument('-outdim', default=-1, type=int, help='output dimensionality') parser.add_argument('-threads', default=-1, type=int, help='number of training threads') parser.add_argument('-samples', default=-1, type=int, help='number of training mini-batches') diff --git a/plot.py b/plot.py index 8d1e916..c8cf005 100755 --- a/plot.py +++ b/plot.py @@ -6,9 +6,9 @@ parser = argparse.ArgumentParser() -parser.add_argument('-input', default='', help='input file') +parser.add_argument('-input', default='', help='input file', required=True) parser.add_argument('-label', default='', help='label file') -parser.add_argument('-output', default='', help='output file') +parser.add_argument('-output', default='', help='output file', required=True) parser.add_argument('-range', default='', help='axis range') args = parser.parse_args() From 8f196292c76b854ef87268e2e5f99813f943f8c3 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Fri, 19 Feb 2021 18:13:46 +0330 Subject: [PATCH 05/19] handle new style label file --- plot.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/plot.py b/plot.py index c8cf005..491f6da 100755 --- a/plot.py +++ b/plot.py @@ -1,8 +1,9 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 + +import argparse import numpy import matplotlib.pyplot as plt -import argparse parser = argparse.ArgumentParser() @@ -13,22 +14,19 @@ args = parser.parse_args() -label = [] +labels = {} if args.label != '': - for line in open(args.label): - label.append(line.strip()) + with open(args.label) as f: + for line in f: + node_id, label = line.strip().split() + labels[node_id] = label -N = M = 0 all_data = {} -for i, line in enumerate(open(args.input)): - vec = line.strip().split(' ') - if i == 0: - N = int(vec[0]) - M = int(vec[1]) - elif i <= N: - if args.label == '': - label.append(0) - all_data.setdefault(label[i-1], []).append((float(vec[-2]), float(vec[-1]))) +with open(args.input) as f: + _ = f.readline() # ignore first line + for line in f: + vec = line.strip().split(' ') + all_data.setdefault(labels.get(vec[0], 0), []).append((float(vec[-2]), float(vec[-1]))) colors = plt.cm.rainbow(numpy.linspace(0, 1, len(all_data))) From af8d1031c53047c8af12611bc151d5631164f221 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Fri, 19 Feb 2021 18:18:55 +0330 Subject: [PATCH 06/19] change args to use double dash --- LargeVis_run.py | 26 ++++++++++++------------ Linux/main.cpp | 52 ++++++++++++++++++++++++------------------------ README.md | 34 +++++++++++++++---------------- Windows/main.cpp | 52 ++++++++++++++++++++++++------------------------ plot.py | 8 ++++---- 5 files changed, 86 insertions(+), 86 deletions(-) diff --git a/LargeVis_run.py b/LargeVis_run.py index 8fca98f..cbdb05d 100755 --- a/LargeVis_run.py +++ b/LargeVis_run.py @@ -4,19 +4,19 @@ import argparse parser = argparse.ArgumentParser() -parser.add_argument('-fea', default=1, type=int, help='whether to visualize high-dimensional feature vectors or networks') -parser.add_argument('-input', default='', help='input file', required=True) -parser.add_argument('-output', default='', help='output file', required=True) -parser.add_argument('-outdim', default=-1, type=int, help='output dimensionality') -parser.add_argument('-threads', default=-1, type=int, help='number of training threads') -parser.add_argument('-samples', default=-1, type=int, help='number of training mini-batches') -parser.add_argument('-prop', default=-1, type=int, help='number of propagations') -parser.add_argument('-alpha', default=-1, type=float, help='learning rate') -parser.add_argument('-trees', default=-1, type=int, help='number of rp-trees') -parser.add_argument('-neg', default=-1, type=int, help='number of negative samples') -parser.add_argument('-neigh', default=-1, type=int, help='number of neighbors in the NN-graph') -parser.add_argument('-gamma', default=-1, type=float, help='weight assigned to negative edges') -parser.add_argument('-perp', default=-1, type=float, help='perplexity for the NN-grapn') +parser.add_argument('--fea', default=1, type=int, help='whether to visualize high-dimensional feature vectors or networks') +parser.add_argument('--input', default='', help='input file', required=True) +parser.add_argument('--output', default='', help='output file', required=True) +parser.add_argument('--outdim', default=-1, type=int, help='output dimensionality') +parser.add_argument('--threads', default=-1, type=int, help='number of training threads') +parser.add_argument('--samples', default=-1, type=int, help='number of training mini-batches') +parser.add_argument('--prop', default=-1, type=int, help='number of propagations') +parser.add_argument('--alpha', default=-1, type=float, help='learning rate') +parser.add_argument('--trees', default=-1, type=int, help='number of rp-trees') +parser.add_argument('--neg', default=-1, type=int, help='number of negative samples') +parser.add_argument('--neigh', default=-1, type=int, help='number of neighbors in the NN-graph') +parser.add_argument('--gamma', default=-1, type=float, help='weight assigned to negative edges') +parser.add_argument('--perp', default=-1, type=float, help='perplexity for the NN-grapn') args = parser.parse_args() diff --git a/Linux/main.cpp b/Linux/main.cpp index 22abea8..b0413b6 100644 --- a/Linux/main.cpp +++ b/Linux/main.cpp @@ -24,34 +24,34 @@ int main(int argc, char **argv) long long i; if (argc < 3) { - printf("-fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); - printf("-input: Input file of feature vectors or networks\n"); - printf("-output: Output file of low-dimensional representations.\n"); - printf("-threads: Number of threads. Default is 8.\n"); - printf("-outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); - printf("-samples: Number of edge samples for graph layout (in millions). Default is set to data size / 100 (million).\n"); - printf("-prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); - printf("-alpha: Initial learning rate. Default is 1.0.\n"); - printf("-trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); - printf("-neg: Number of negative samples used for negative sampling. Default is 5.\n"); - printf("-neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); - printf("-gamma: The weights assigned to negative edges. Default is 7.\n"); - printf("-perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); + printf("--fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); + printf("--input: Input file of feature vectors or networks\n"); + printf("--output: Output file of low-dimensional representations.\n"); + printf("--threads: Number of threads. Default is 8.\n"); + printf("--outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); + printf("--samples: Number of edge samples for graph layout (in millions). Default is set to data size / 100 (million).\n"); + printf("--prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); + printf("--alpha: Initial learning rate. Default is 1.0.\n"); + printf("--trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); + printf("--neg: Number of negative samples used for negative sampling. Default is 5.\n"); + printf("--neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); + printf("--gamma: The weights assigned to negative edges. Default is 7.\n"); + printf("--perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); return 0; } - if ((i = ArgPos((char *)"-fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-input", argc, argv)) > 0) strcpy(infile, argv[i + 1]); - if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(outfile, argv[i + 1]); - if ((i = ArgPos((char *)"-outdim", argc, argv)) > 0) out_dim = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-samples", argc, argv)) > 0) n_samples = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) n_threads = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-neg", argc, argv)) > 0) n_negative = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-neigh", argc, argv)) > 0) n_neighbors = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-trees", argc, argv)) > 0) n_trees = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-prop", argc, argv)) > 0) n_propagation = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); - if ((i = ArgPos((char *)"-gamma", argc, argv)) > 0) n_gamma = atof(argv[i + 1]); - if ((i = ArgPos((char *)"-perp", argc, argv)) > 0) perplexity = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--input", argc, argv)) > 0) strcpy(infile, argv[i + 1]); + if ((i = ArgPos((char *)"--output", argc, argv)) > 0) strcpy(outfile, argv[i + 1]); + if ((i = ArgPos((char *)"--outdim", argc, argv)) > 0) out_dim = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--samples", argc, argv)) > 0) n_samples = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--threads", argc, argv)) > 0) n_threads = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--neg", argc, argv)) > 0) n_negative = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--neigh", argc, argv)) > 0) n_neighbors = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--trees", argc, argv)) > 0) n_trees = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--prop", argc, argv)) > 0) n_propagation = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--gamma", argc, argv)) > 0) n_gamma = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--perp", argc, argv)) > 0) perplexity = atof(argv[i + 1]); LargeVis model; if (if_embed) diff --git a/README.md b/README.md index 78a0d35..91560b3 100644 --- a/README.md +++ b/README.md @@ -62,28 +62,28 @@ For networks, each line of the input file is a DIRECTED edge. For each undirecte ``` For C++ executable file, ```bash -./LargeVis -input -output +./LargeVis --input INPUT --output OUTPUT ``` or for Python, ```bash -python LargeVis_run.py -input -output +python LargeVis_run.py --input INPUT --output OUTPUT ``` -* `-input`: Input file of feature vectors or networks (see the Example folders for input format). -* `-output`: Output file of low-dimensional representations. +* `--input`: Input file of feature vectors or networks (see the Example folders for input format). +* `--output`: Output file of low-dimensional representations. Besides the two parameters, other optional parameters include: -* `-fea`: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1. -* `-threads`: Number of threads. Default is 8. -* `-outdim`: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2. -* `-samples`: Number of edge samples for graph layout (in millions). Default is set to ```data size / 100``` (million). -* `-prop`: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3. -* `-alpha`: Initial learning rate. Default is 1.0. -* `-trees`: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases unless you are dealing with very large datasets (e.g. data size over 5 million), and less trees are suitable for smaller datasets. Default is set according to the data size. -* `-neg`: Number of negative samples used for negative sampling. Default is 5. -* `-neigh`: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150. -* `-gamma`: The weights assigned to negative edges. Default is 7. -* `-perp`: The perplexity used for deciding edge weights in K-NNG. Default is 50. +* `--fea`: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1. +* `--threads`: Number of threads. Default is 8. +* `--outdim`: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2. +* `--samples`: Number of edge samples for graph layout (in millions). Default is set to ```data size / 100``` (million). +* `--prop`: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3. +* `--alpha`: Initial learning rate. Default is 1.0. +* `--trees`: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases unless you are dealing with very large datasets (e.g. data size over 5 million), and less trees are suitable for smaller datasets. Default is set according to the data size. +* `--neg`: Number of negative samples used for negative sampling. Default is 5. +* `--neigh`: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150. +* `--gamma`: The weights assigned to negative edges. Default is 7. +* `--perp`: The perplexity used for deciding edge weights in K-NNG. Default is 50. ## Examples @@ -91,8 +91,8 @@ We provide some examples including MNIST(high-dimensional feature vectors) and C For example, to visualize the MNIST dataset, ```bash -python LargeVis_run.py -input mnist_vec784D.txt -output mnist_vec2D.txt -threads 16 -python plot.py -input mnist_vec2D.txt -label mnist_label.txt -output mnist_vec2D_plot +python LargeVis_run.py --input mnist_vec784D.txt --output mnist_vec2D.txt --threads 16 +python plot.py --input mnist_vec2D.txt --label mnist_label.txt --output mnist_vec2D_plot ``` ![plot of mnist](Examples/MNIST/mnist_plot.png) diff --git a/Windows/main.cpp b/Windows/main.cpp index 22abea8..b0413b6 100644 --- a/Windows/main.cpp +++ b/Windows/main.cpp @@ -24,34 +24,34 @@ int main(int argc, char **argv) long long i; if (argc < 3) { - printf("-fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); - printf("-input: Input file of feature vectors or networks\n"); - printf("-output: Output file of low-dimensional representations.\n"); - printf("-threads: Number of threads. Default is 8.\n"); - printf("-outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); - printf("-samples: Number of edge samples for graph layout (in millions). Default is set to data size / 100 (million).\n"); - printf("-prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); - printf("-alpha: Initial learning rate. Default is 1.0.\n"); - printf("-trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); - printf("-neg: Number of negative samples used for negative sampling. Default is 5.\n"); - printf("-neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); - printf("-gamma: The weights assigned to negative edges. Default is 7.\n"); - printf("-perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); + printf("--fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); + printf("--input: Input file of feature vectors or networks\n"); + printf("--output: Output file of low-dimensional representations.\n"); + printf("--threads: Number of threads. Default is 8.\n"); + printf("--outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); + printf("--samples: Number of edge samples for graph layout (in millions). Default is set to data size / 100 (million).\n"); + printf("--prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); + printf("--alpha: Initial learning rate. Default is 1.0.\n"); + printf("--trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); + printf("--neg: Number of negative samples used for negative sampling. Default is 5.\n"); + printf("--neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); + printf("--gamma: The weights assigned to negative edges. Default is 7.\n"); + printf("--perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); return 0; } - if ((i = ArgPos((char *)"-fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-input", argc, argv)) > 0) strcpy(infile, argv[i + 1]); - if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(outfile, argv[i + 1]); - if ((i = ArgPos((char *)"-outdim", argc, argv)) > 0) out_dim = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-samples", argc, argv)) > 0) n_samples = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) n_threads = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-neg", argc, argv)) > 0) n_negative = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-neigh", argc, argv)) > 0) n_neighbors = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-trees", argc, argv)) > 0) n_trees = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-prop", argc, argv)) > 0) n_propagation = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); - if ((i = ArgPos((char *)"-gamma", argc, argv)) > 0) n_gamma = atof(argv[i + 1]); - if ((i = ArgPos((char *)"-perp", argc, argv)) > 0) perplexity = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--input", argc, argv)) > 0) strcpy(infile, argv[i + 1]); + if ((i = ArgPos((char *)"--output", argc, argv)) > 0) strcpy(outfile, argv[i + 1]); + if ((i = ArgPos((char *)"--outdim", argc, argv)) > 0) out_dim = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--samples", argc, argv)) > 0) n_samples = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--threads", argc, argv)) > 0) n_threads = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--neg", argc, argv)) > 0) n_negative = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--neigh", argc, argv)) > 0) n_neighbors = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--trees", argc, argv)) > 0) n_trees = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--prop", argc, argv)) > 0) n_propagation = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--gamma", argc, argv)) > 0) n_gamma = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--perp", argc, argv)) > 0) perplexity = atof(argv[i + 1]); LargeVis model; if (if_embed) diff --git a/plot.py b/plot.py index 491f6da..49feed7 100755 --- a/plot.py +++ b/plot.py @@ -7,10 +7,10 @@ parser = argparse.ArgumentParser() -parser.add_argument('-input', default='', help='input file', required=True) -parser.add_argument('-label', default='', help='label file') -parser.add_argument('-output', default='', help='output file', required=True) -parser.add_argument('-range', default='', help='axis range') +parser.add_argument('--input', '-i', default='', help='input file', required=True) +parser.add_argument('--label', '-l', default='', help='label file') +parser.add_argument('--output', '-o', default='', help='output file', required=True) +parser.add_argument('--range', '-r', default='', help='axis range') args = parser.parse_args() From 1ebb98ff29e52175443edb7cbfa18d3da0761b28 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Fri, 19 Feb 2021 18:36:22 +0330 Subject: [PATCH 07/19] add no-axis and legend options to plot script --- plot.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/plot.py b/plot.py index 49feed7..853706a 100755 --- a/plot.py +++ b/plot.py @@ -6,12 +6,12 @@ import matplotlib.pyplot as plt parser = argparse.ArgumentParser() - parser.add_argument('--input', '-i', default='', help='input file', required=True) parser.add_argument('--label', '-l', default='', help='label file') parser.add_argument('--output', '-o', default='', help='output file', required=True) -parser.add_argument('--range', '-r', default='', help='axis range') - +parser.add_argument('--range', '-r', type=float, help='axis range') +parser.add_argument('--no-axis', '-n', help='hide axis', action='store_true') +parser.add_argument('--legend', '-s', help='show legend', action='store_true') args = parser.parse_args() labels = {} @@ -30,12 +30,20 @@ colors = plt.cm.rainbow(numpy.linspace(0, 1, len(all_data))) -for color, ll in zip(colors, sorted(all_data.keys())): - x = [t[0] for t in all_data[ll]] - y = [t[1] for t in all_data[ll]] - plt.plot(x, y, '.', color=color, markersize=1) -if args.range != '': - l = abs(float(args.range)) - plt.xlim(-l, l) - plt.ylim(-l, l) +for color, label in zip(colors, sorted(all_data.keys())): + x = [t[0] for t in all_data[label]] + y = [t[1] for t in all_data[label]] + plt.plot(x, y, '.', color=color, markersize=1, label=label) + +if args.range: + axis_limit = abs(float(args.range)) + plt.xlim(-axis_limit, axis_limit) + plt.ylim(-axis_limit, axis_limit) + +if args.no_axis: + plt.axis('off') + +if args.legend: + plt.legend() + plt.savefig(args.output, dpi=500) From 130c4afdcb706c51b0ce203f890a148be79a8e94 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Fri, 19 Feb 2021 23:41:28 +0330 Subject: [PATCH 08/19] change colormap --- plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plot.py b/plot.py index 853706a..c97be98 100755 --- a/plot.py +++ b/plot.py @@ -28,7 +28,7 @@ vec = line.strip().split(' ') all_data.setdefault(labels.get(vec[0], 0), []).append((float(vec[-2]), float(vec[-1]))) -colors = plt.cm.rainbow(numpy.linspace(0, 1, len(all_data))) +colors = plt.cm.tab10(numpy.linspace(0, 1, len(all_data))) for color, label in zip(colors, sorted(all_data.keys())): x = [t[0] for t in all_data[label]] From b1d6b8edba195610128d5c3f98a6d49914bfa84f Mon Sep 17 00:00:00 2001 From: "H@di" Date: Sat, 20 Feb 2021 01:00:40 +0330 Subject: [PATCH 09/19] check required args --- Linux/main.cpp | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/Linux/main.cpp b/Linux/main.cpp index b0413b6..72ac6fe 100644 --- a/Linux/main.cpp +++ b/Linux/main.cpp @@ -22,24 +22,7 @@ int ArgPos(char *str, int argc, char **argv) { int main(int argc, char **argv) { long long i; - if (argc < 3) - { - printf("--fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); - printf("--input: Input file of feature vectors or networks\n"); - printf("--output: Output file of low-dimensional representations.\n"); - printf("--threads: Number of threads. Default is 8.\n"); - printf("--outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); - printf("--samples: Number of edge samples for graph layout (in millions). Default is set to data size / 100 (million).\n"); - printf("--prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); - printf("--alpha: Initial learning rate. Default is 1.0.\n"); - printf("--trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); - printf("--neg: Number of negative samples used for negative sampling. Default is 5.\n"); - printf("--neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); - printf("--gamma: The weights assigned to negative edges. Default is 7.\n"); - printf("--perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); - return 0; - } - if ((i = ArgPos((char *)"--fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); if ((i = ArgPos((char *)"--input", argc, argv)) > 0) strcpy(infile, argv[i + 1]); if ((i = ArgPos((char *)"--output", argc, argv)) > 0) strcpy(outfile, argv[i + 1]); if ((i = ArgPos((char *)"--outdim", argc, argv)) > 0) out_dim = atoi(argv[i + 1]); @@ -53,11 +36,30 @@ int main(int argc, char **argv) if ((i = ArgPos((char *)"--gamma", argc, argv)) > 0) n_gamma = atof(argv[i + 1]); if ((i = ArgPos((char *)"--perp", argc, argv)) > 0) perplexity = atof(argv[i + 1]); + if (argc < 3 || strlen(infile) == 0 || strlen(outfile) == 0) + { + printf("--fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); + printf("--input: Input file of feature vectors or networks\n"); + printf("--output: Output file of low-dimensional representations.\n"); + printf("--threads: Number of threads. Default is 8.\n"); + printf("--outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); + printf("--samples: Number of edge samples for graph layout (in millions). Default is set to data size / 100 (million).\n"); + printf("--prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); + printf("--alpha: Initial learning rate. Default is 1.0.\n"); + printf("--trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); + printf("--neg: Number of negative samples used for negative sampling. Default is 5.\n"); + printf("--neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); + printf("--gamma: The weights assigned to negative edges. Default is 7.\n"); + printf("--perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); + return 2; + } + LargeVis model; - if (if_embed) - model.load_from_file(infile); - else - model.load_from_graph(infile); + if (if_embed) + model.load_from_file(infile); + else + model.load_from_graph(infile); + model.run(out_dim, n_threads, n_samples, n_propagation, alpha, n_trees, n_negative, n_neighbors, n_gamma, perplexity); model.save(outfile); From cf63bc9f9f992c01c80f047d064d705d68005240 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Sat, 20 Feb 2021 01:04:40 +0330 Subject: [PATCH 10/19] unify indentations --- Linux/LargeVis.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Linux/LargeVis.cpp b/Linux/LargeVis.cpp index 022e78e..1541800 100644 --- a/Linux/LargeVis.cpp +++ b/Linux/LargeVis.cpp @@ -8,7 +8,7 @@ LargeVis::LargeVis() knn_vec = old_knn_vec = NULL; annoy_index = NULL; head = alias = NULL; - neg_table = NULL; + neg_table = NULL; } const gsl_rng_type *LargeVis::gsl_T = NULL; @@ -26,8 +26,8 @@ void LargeVis::clean_model() vis = prob = NULL; knn_vec = old_knn_vec = NULL; annoy_index = NULL; - neg_table = NULL; - alias = NULL; + neg_table = NULL; + alias = NULL; edge_count_actual = 0; neg_size = 1e8; @@ -56,7 +56,7 @@ void LargeVis::load_from_file(char *infile) printf("\nFile not found!\n"); return; } - printf("Reading input file %s ......", infile); fflush(stdout); + printf("Reading input file %s ......", infile); fflush(stdout); fscanf(fin, "%lld%lld", &n_vertices, &n_dim); vec = new real[n_vertices * n_dim]; for (long long i = 0; i < n_vertices; ++i) @@ -162,7 +162,7 @@ long long LargeVis::get_out_dim() void LargeVis::normalize() { - printf("Normalizing ......"); fflush(stdout); + printf("Normalizing ......"); fflush(stdout); real *mean = new real[n_dim]; for (long long i = 0; i < n_dim; ++i) mean[i] = 0; for (long long i = 0, ll = 0; i < n_vertices; ++i, ll += n_dim) @@ -281,7 +281,7 @@ void *LargeVis::annoy_thread_caller(void *arg) void LargeVis::run_annoy() { - printf("Running ANNOY ......"); fflush(stdout); + printf("Running ANNOY ......"); fflush(stdout); annoy_index = new AnnoyIndex(n_dim); for (long long i = 0; i < n_vertices; ++i) annoy_index->add_item(i, &vec[i * n_dim]); @@ -293,7 +293,7 @@ void LargeVis::run_annoy() for (int j = 0; j < n_threads; ++j) pthread_create(&pt[j], NULL, LargeVis::annoy_thread_caller, new arg_struct(this, j)); for (int j = 0; j < n_threads; ++j) pthread_join(pt[j], NULL); delete[] pt; - delete annoy_index; annoy_index = NULL; + delete annoy_index; annoy_index = NULL; printf(" Done.\n"); } @@ -375,7 +375,7 @@ void LargeVis::compute_similarity_thread(int id) for (iter = 0; iter < 200; ++iter) { H = 0; - sum_weight = FLT_MIN; + sum_weight = FLT_MIN; for (p = head[x]; p >= 0; p = next[p]) { sum_weight += tmp = exp(-beta * edge_weight[p]); @@ -392,8 +392,8 @@ void LargeVis::compute_similarity_thread(int id) hi_beta = beta; if (lo_beta < 0) beta /= 2; else beta = (lo_beta + beta) / 2; } - if(beta > FLT_MAX) beta = FLT_MAX; - } + if(beta > FLT_MAX) beta = FLT_MAX; + } for (p = head[x], sum_weight = FLT_MIN; p >= 0; p = next[p]) { sum_weight += edge_weight[p] = exp(-beta * edge_weight[p]); @@ -440,7 +440,7 @@ void *LargeVis::search_reverse_thread_caller(void *arg) void LargeVis::compute_similarity() { - printf("Computing similarities ......"); fflush(stdout); + printf("Computing similarities ......"); fflush(stdout); n_edge = 0; head = new long long[n_vertices]; long long i, x, y, p, q; @@ -458,8 +458,8 @@ void LargeVis::compute_similarity() head[x] = n_edge++; } } - delete[] vec; vec = NULL; - delete[] knn_vec; knn_vec = NULL; + delete[] vec; vec = NULL; + delete[] knn_vec; knn_vec = NULL; pthread_t *pt = new pthread_t[n_threads]; for (int j = 0; j < n_threads; ++j) pthread_create(&pt[j], NULL, LargeVis::compute_similarity_thread_caller, new arg_struct(this, j)); for (int j = 0; j < n_threads; ++j) pthread_join(pt[j], NULL); @@ -515,7 +515,7 @@ void LargeVis::test_accuracy() ++hit_case; } } - delete heap; + delete heap; printf("Test knn accuracy : %.2f%%\n", hit_case * 100.0 / (test_case * n_neighbors)); } @@ -542,7 +542,7 @@ void LargeVis::init_neg_table() { long long x, p, i; neg_size = 1e8; - reverse.clear(); vector (reverse).swap(reverse); + reverse.clear(); vector (reverse).swap(reverse); real sum_weights = 0, dd, *weights = new real[n_vertices]; for (i = 0; i < n_vertices; ++i) weights[i] = 0; for (x = 0; x < n_vertices; ++x) @@ -553,8 +553,8 @@ void LargeVis::init_neg_table() } sum_weights += weights[x] = pow(weights[x], 0.75); } - next.clear(); vector (next).swap(next); - delete[] head; head = NULL; + next.clear(); vector (next).swap(next); + delete[] head; head = NULL; neg_table = new int[neg_size]; dd = weights[0]; for (i = x = 0; i < neg_size; ++i) From 4483228b4babf11deb9351b11d94048a7b4bf38b Mon Sep 17 00:00:00 2001 From: "H@di" Date: Sat, 20 Feb 2021 01:25:37 +0330 Subject: [PATCH 11/19] handle inputs without weight --- Linux/LargeVis.cpp | 14 ++++++++++++-- Linux/LargeVis.h | 2 +- Linux/main.cpp | 7 +++++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/Linux/LargeVis.cpp b/Linux/LargeVis.cpp index 1541800..cd680a9 100644 --- a/Linux/LargeVis.cpp +++ b/Linux/LargeVis.cpp @@ -80,7 +80,17 @@ void LargeVis::load_from_data(real *data, long long n_vert, long long n_di) printf("Total vertices : %lld\tDimension : %lld\n", n_vertices, n_dim); } -void LargeVis::load_from_graph(char *infile) +bool load_edge_from_graph(FILE *fin, char *w1, char *w2, real *weight, bool use_default_weight) { + if (use_default_weight) + { + (*weight) = 1; + return fscanf(fin, "%s%s", w1, w2) == 2; + } + else + return fscanf(fin, "%s%s%f", w1, w2, weight) == 3; +} + +void LargeVis::load_from_graph(char *infile, bool use_default_weight) { clean_data(); char *w1 = new char[1000]; @@ -96,7 +106,7 @@ void LargeVis::load_from_graph(char *infile) return; } printf("Reading input file %s ......%c", infile, 13); - while (fscanf(fin, "%s%s%f", w1, w2, &weight) == 3) + while (load_edge_from_graph(fin, w1, w2, &weight, use_default_weight)) { if (!dict.count(w1)) { dict[w1] = n_vertices++; names.push_back(w1); } if (!dict.count(w2)) { dict[w2] = n_vertices++; names.push_back(w2); } diff --git a/Linux/LargeVis.h b/Linux/LargeVis.h index 6914ac5..20c2ba9 100644 --- a/Linux/LargeVis.h +++ b/Linux/LargeVis.h @@ -67,7 +67,7 @@ class LargeVis{ public: LargeVis(); void load_from_file(char *infile); - void load_from_graph(char *infile); + void load_from_graph(char *infile, bool use_default_weight = false); void load_from_data(real *data, long long n_vert, long long n_di); void save(char *outfile); void run(long long out_d = -1, long long n_thre = -1, long long n_samp = -1, long long n_prop = -1, real alph = -1, long long n_tree = -1, long long n_nega = -1, long long n_neig = -1, real gamm = -1, real perp = -1); diff --git a/Linux/main.cpp b/Linux/main.cpp index 72ac6fe..991e7a9 100644 --- a/Linux/main.cpp +++ b/Linux/main.cpp @@ -6,11 +6,12 @@ char infile[1000], outfile[1000]; long long if_embed = 1, out_dim = -1, n_samples = -1, n_threads = -1, n_negative = -1, n_neighbors = -1, n_trees = -1, n_propagation = -1; real alpha = -1, n_gamma = -1, perplexity = -1; +bool use_default_weight = false; int ArgPos(char *str, int argc, char **argv) { int a; for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { - if (a == argc - 1) { + if (a == argc - 1 && strcmp(str, "--default-weight")) { printf("Argument missing for %s\n", str); exit(1); } @@ -35,6 +36,7 @@ int main(int argc, char **argv) if ((i = ArgPos((char *)"--alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); if ((i = ArgPos((char *)"--gamma", argc, argv)) > 0) n_gamma = atof(argv[i + 1]); if ((i = ArgPos((char *)"--perp", argc, argv)) > 0) perplexity = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--default-weight", argc, argv)) > 0) use_default_weight = true; if (argc < 3 || strlen(infile) == 0 || strlen(outfile) == 0) { @@ -51,6 +53,7 @@ int main(int argc, char **argv) printf("--neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); printf("--gamma: The weights assigned to negative edges. Default is 7.\n"); printf("--perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); + printf("--default-weight: Use 1 as weight of edges instead of reading weight from edge list.\n"); return 2; } @@ -58,7 +61,7 @@ int main(int argc, char **argv) if (if_embed) model.load_from_file(infile); else - model.load_from_graph(infile); + model.load_from_graph(infile, use_default_weight); model.run(out_dim, n_threads, n_samples, n_propagation, alpha, n_trees, n_negative, n_neighbors, n_gamma, perplexity); From 012dcf4b39e0962eb1f4312959fb364092617fb5 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Sat, 20 Feb 2021 01:31:29 +0330 Subject: [PATCH 12/19] fix warnings --- Linux/LargeVis.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/Linux/LargeVis.cpp b/Linux/LargeVis.cpp index cd680a9..9a47818 100644 --- a/Linux/LargeVis.cpp +++ b/Linux/LargeVis.cpp @@ -57,13 +57,22 @@ void LargeVis::load_from_file(char *infile) return; } printf("Reading input file %s ......", infile); fflush(stdout); - fscanf(fin, "%lld%lld", &n_vertices, &n_dim); + if (fscanf(fin, "%lld%lld", &n_vertices, &n_dim) != 2) { + printf("Could not read dimensions\n"); + fclose(fin); + exit(1); + } vec = new real[n_vertices * n_dim]; for (long long i = 0; i < n_vertices; ++i) { for (long long j = 0; j < n_dim; ++j) { - fscanf(fin, "%f", &vec[i * n_dim + j]); + if (fscanf(fin, "%f", &vec[i * n_dim + j]) != 1) + { + fclose(fin); + printf("Could not read line %lld\n", i + 1); + exit(1); + } } } fclose(fin); @@ -358,7 +367,7 @@ void LargeVis::run_propagation() { for (int i = 0; i < n_propagations; ++i) { - printf("Running propagation %d/%d%c", i + 1, n_propagations, 13); + printf("Running propagation %d/%lld%c", i + 1, n_propagations, 13); fflush(stdout); old_knn_vec = knn_vec; knn_vec = new std::vector[n_vertices]; From b1d45acd13d9c5189bcfe45311cfb7d5a234355c Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Mon, 3 Apr 2017 20:40:50 -0400 Subject: [PATCH 13/19] Make module Python3 compatible. --- Linux/LargeVismodule.cpp | 74 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/Linux/LargeVismodule.cpp b/Linux/LargeVismodule.cpp index bbc50d9..597685f 100644 --- a/Linux/LargeVismodule.cpp +++ b/Linux/LargeVismodule.cpp @@ -1,6 +1,19 @@ #include "Python.h" #include "LargeVis.h" +struct module_state { + PyObject *error; +}; + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) +#else +#define GETSTATE(m) (&_state) +static struct module_state _state; +#endif + + real *out_vec; LargeVis model; char *filename; @@ -94,7 +107,11 @@ static PyObject *LoadFromList(PyObject *self, PyObject *args) } for (long long j = 0; j < n_dim; ++j) { +#ifdef IS_PY3K + real x = atof(PyBytes_AS_STRING(PyObject_Bytes(PyList_GetItem(vec, j)))); +#else real x = atof(PyString_AsString(PyObject_Str(PyList_GetItem(vec, j)))); +#endif data[ll + j] = x; } } @@ -114,7 +131,8 @@ static PyObject *SaveToFile(PyObject *self, PyObject *args) return Py_None; } -static PyMethodDef PyExtMethods[] = + +static PyMethodDef LargeVis_methods[] = { { "run", Run, METH_VARARGS, "(All arguments are optional.\nrun(output dimension, threads number, training samples, propagations number, learning rate, rp-trees number, negative samples number, neighbors number, gamma, perplexity)\nFire up LargeVis." }, { "loadfile", LoadFromFile, METH_VARARGS, "loadfile(str filename)\nLoad high-dimensional feature vectors from file." }, @@ -124,8 +142,58 @@ static PyMethodDef PyExtMethods[] = { NULL, NULL, 0, NULL } }; -PyMODINIT_FUNC initLargeVis() +#if PY_MAJOR_VERSION >= 3 + +static int LargeVis_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int LargeVis_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); + return 0; +} +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "myextension", + NULL, + sizeof(struct module_state), + LargeVis_methods, + NULL, + LargeVis_traverse, + LargeVis_clear, + NULL +}; + +#define INITERROR return NULL + +PyMODINIT_FUNC +PyInit_myextension(void) + +#else +#define INITERROR return + +void +initLargeVis(void) +#endif { printf("LargeVis successfully imported!\n"); - Py_InitModule("LargeVis", PyExtMethods); +#if PY_MAJOR_VERSION >= 3 + PyObject *module = PyModule_Create(&moduledef); +#else + PyObject *module = Py_InitModule("LargeVis", LargeVis_methods); +#endif + if (module == NULL) + INITERROR; + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException("LargeVis.Error", NULL, NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + +#if PY_MAJOR_VERSION >= 3 + return module; +#endif } From dfa2a4bbb60f556b61a29683735a9d858a0d8689 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Thu, 19 Oct 2017 16:23:12 -0400 Subject: [PATCH 14/19] As per requested changes. --- Linux/LargeVismodule.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Linux/LargeVismodule.cpp b/Linux/LargeVismodule.cpp index 597685f..ccab235 100644 --- a/Linux/LargeVismodule.cpp +++ b/Linux/LargeVismodule.cpp @@ -155,7 +155,7 @@ static int LargeVis_clear(PyObject *m) { } static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, - "myextension", + "LargeVis", NULL, sizeof(struct module_state), LargeVis_methods, @@ -168,7 +168,7 @@ static struct PyModuleDef moduledef = { #define INITERROR return NULL PyMODINIT_FUNC -PyInit_myextension(void) +PyInit_LargeVis(void) #else #define INITERROR return From 174f3d854950584294ea1fa50b678f6464bb80a7 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sun, 2 Sep 2018 13:37:18 -0400 Subject: [PATCH 15/19] Support py3 and loading numpy arrays --- Linux/LargeVismodule.cpp | 50 ++++++++++++++++++++++++++++++++++++++-- Linux/setup.py | 3 ++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/Linux/LargeVismodule.cpp b/Linux/LargeVismodule.cpp index ccab235..23ccf16 100644 --- a/Linux/LargeVismodule.cpp +++ b/Linux/LargeVismodule.cpp @@ -1,5 +1,6 @@ #include "Python.h" #include "LargeVis.h" +#include "numpy/arrayobject.h" struct module_state { PyObject *error; @@ -120,6 +121,51 @@ static PyObject *LoadFromList(PyObject *self, PyObject *args) return Py_None; } +static PyObject *LoadFromArray(PyObject *self, PyObject *args) +{ + PyArrayObject *input; + long long n_vertices; + long long n_dim; + + //printf("Starting LoadFromArray\n"); + + if (!PyArg_ParseTuple(args, "O", &input)) return NULL; + + if (NULL == input) return NULL; + + //printf("Got input object parsed as array\n"); + + // Verify we have a 2D array of doubles + if ((PyArray_NDIM(input) != 2) || (!PyArray_ISFLOAT(input))) return NULL; + + n_vertices = PyArray_DIM(input, 0); + n_dim = PyArray_DIM(input, 1); + + //printf("Read array data as shape (%i, %i)\n", n_vertices, n_dim); + + //real *data = new real[n_vertices * n_dim]; + + //printf("Allocated new data array\n", n_vertices, n_dim); + + real *indata = (real *) PyArray_DATA(input); + + // printf("Got pointer to input data\n"); + + // for (long long i = 0; i < n_vertices; ++i) { + // printf("Processing row %i\n", i); + // for (long long j = 0; j < n_dim; ++j) { + // // data[i * n_dim + j] = (real) *((real *) PyArray_GETPTR2(input, i, j)); + // printf("processing col %i\n", j); + // data[i * n_dim + j] = indata[i * n_dim + j]; + // } + // } + + //printf("Completed reading in data from numpy array\n"); + + model.load_from_data(indata, n_vertices, n_dim); + return Py_None; +} + static PyObject *SaveToFile(PyObject *self, PyObject *args) { if (!PyArg_ParseTuple(args, "s", &filename)) @@ -138,6 +184,7 @@ static PyMethodDef LargeVis_methods[] = { "loadfile", LoadFromFile, METH_VARARGS, "loadfile(str filename)\nLoad high-dimensional feature vectors from file." }, { "loadgraph", LoadFromGraph, METH_VARARGS, "loadfile(str filename)\nLoad graph from file." }, { "loaddata", LoadFromList, METH_VARARGS, "loaddata(X)\nLoad data from list." }, + { "loadarray", LoadFromArray, METH_VARARGS, "loadarray(X)\nLoad data from a numpy array."}, { "save", SaveToFile, METH_VARARGS, "save(str filename)\nSave data to file." }, { NULL, NULL, 0, NULL } }; @@ -173,11 +220,10 @@ PyInit_LargeVis(void) #else #define INITERROR return -void +PyMODINIT_FUNC initLargeVis(void) #endif { - printf("LargeVis successfully imported!\n"); #if PY_MAJOR_VERSION >= 3 PyObject *module = PyModule_Create(&moduledef); #else diff --git a/Linux/setup.py b/Linux/setup.py index 029c8f5..3d20921 100644 --- a/Linux/setup.py +++ b/Linux/setup.py @@ -1,9 +1,10 @@ from distutils.core import setup, Extension +import numpy as np LargeVis = Extension('LargeVis', sources = ['LargeVis.cpp', 'LargeVismodule.cpp'], depends=['LargeVis.h'], - include_dirs = ['/usr/local/include'], + include_dirs = ['/usr/local/include', np.get_include()], library_dirs = ['/usr/local/lib'], libraries=['gsl', 'gslcblas'], extra_compile_args=['-lm -pthread -lgsl -lgslcblas -Ofast -march=native -ffast-math']) From a62e8e0873813fd119c9889234df456517543732 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Sun, 21 Feb 2021 13:29:57 +0330 Subject: [PATCH 16/19] use empty string as default cluster name --- plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plot.py b/plot.py index c97be98..91aef5f 100755 --- a/plot.py +++ b/plot.py @@ -26,7 +26,7 @@ _ = f.readline() # ignore first line for line in f: vec = line.strip().split(' ') - all_data.setdefault(labels.get(vec[0], 0), []).append((float(vec[-2]), float(vec[-1]))) + all_data.setdefault(labels.get(vec[0], ''), []).append((float(vec[-2]), float(vec[-1]))) colors = plt.cm.tab10(numpy.linspace(0, 1, len(all_data))) From e08d50e688a983c2b697bda85ae5617939e6b9da Mon Sep 17 00:00:00 2001 From: "H@di" Date: Sun, 21 Feb 2021 13:30:17 +0330 Subject: [PATCH 17/19] make plot square --- plot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/plot.py b/plot.py index 91aef5f..ba3f53b 100755 --- a/plot.py +++ b/plot.py @@ -46,4 +46,5 @@ if args.legend: plt.legend() +plt.gca().set_aspect('equal', adjustable='box') plt.savefig(args.output, dpi=500) From 925852be0895438402687fdeb4098936a89aaef7 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Thu, 25 Feb 2021 01:44:16 +0330 Subject: [PATCH 18/19] add annotation --- plot.py | 63 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/plot.py b/plot.py index ba3f53b..d7c6937 100755 --- a/plot.py +++ b/plot.py @@ -2,38 +2,55 @@ import argparse -import numpy -import matplotlib.pyplot as plt +import numpy as np +from matplotlib import rcParams, pyplot as plt + +rcParams["svg.fonttype"] = "none" parser = argparse.ArgumentParser() -parser.add_argument('--input', '-i', default='', help='input file', required=True) -parser.add_argument('--label', '-l', default='', help='label file') -parser.add_argument('--output', '-o', default='', help='output file', required=True) -parser.add_argument('--range', '-r', type=float, help='axis range') -parser.add_argument('--no-axis', '-n', help='hide axis', action='store_true') -parser.add_argument('--legend', '-s', help='show legend', action='store_true') +parser.add_argument("--input", "-i", default="", help="input file", required=True) +parser.add_argument("--output", "-o", default="", help="output file", required=True) +parser.add_argument("--clusters", "-c", default="", help="clusters file") +parser.add_argument("--labels", "-l", default="", help="labels to annotate file") +parser.add_argument("--range", "-r", type=float, help="axis range") +parser.add_argument("--no-axis", "-n", help="hide axis", action="store_true") +parser.add_argument("--legend", "-s", help="show legend", action="store_true") args = parser.parse_args() -labels = {} -if args.label != '': - with open(args.label) as f: +clusters = {} +if args.clusters != "": + with open(args.clusters) as f: for line in f: - node_id, label = line.strip().split() - labels[node_id] = label + node, cluster = line.strip().split() + clusters[node] = cluster + +lables = [] +if args.labels != "": + with open(args.labels) as f: + lables = list(map(lambda line: line.strip(), f)) -all_data = {} +positions_by_cluster = {} +positions = {} with open(args.input) as f: _ = f.readline() # ignore first line for line in f: - vec = line.strip().split(' ') - all_data.setdefault(labels.get(vec[0], ''), []).append((float(vec[-2]), float(vec[-1]))) + vec = line.strip().split(" ") + node = vec[0] + pos = (float(vec[-2]), float(vec[-1])) + positions[node] = pos + positions_by_cluster.setdefault(clusters.get(node, ""), []).append(pos) + +colors = plt.cm.tab10(np.linspace(0, 1, len(positions_by_cluster))) + +for color, cluster in zip(colors, sorted(positions_by_cluster.keys())): + x = [t[0] for t in positions_by_cluster[cluster]] + y = [t[1] for t in positions_by_cluster[cluster]] + plt.plot(x, y, ".", color=color, markersize=1, label=cluster) -colors = plt.cm.tab10(numpy.linspace(0, 1, len(all_data))) -for color, label in zip(colors, sorted(all_data.keys())): - x = [t[0] for t in all_data[label]] - y = [t[1] for t in all_data[label]] - plt.plot(x, y, '.', color=color, markersize=1, label=label) +for node in lables: + x, y = positions[node] + plt.annotate(node, xy=(x, y), xytext=(x - 3, y - 3), arrowprops=dict(arrowstyle="-"), fontsize="xx-small") if args.range: axis_limit = abs(float(args.range)) @@ -41,10 +58,10 @@ plt.ylim(-axis_limit, axis_limit) if args.no_axis: - plt.axis('off') + plt.axis("off") if args.legend: plt.legend() -plt.gca().set_aspect('equal', adjustable='box') +plt.gca().set_aspect("equal", adjustable="box") plt.savefig(args.output, dpi=500) From 62a277d348ad17e426a005705f737de6ca0dc3e7 Mon Sep 17 00:00:00 2001 From: "H@di" Date: Thu, 25 Feb 2021 01:58:06 +0330 Subject: [PATCH 19/19] handle overcomplete lable list --- plot.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/plot.py b/plot.py index d7c6937..630c274 100755 --- a/plot.py +++ b/plot.py @@ -49,6 +49,8 @@ for node in lables: + if node not in positions: + continue x, y = positions[node] plt.annotate(node, xy=(x, y), xytext=(x - 3, y - 3), arrowprops=dict(arrowstyle="-"), fontsize="xx-small")