From 66db70a61605b0876f6203f58a1517ee5b17b9bf Mon Sep 17 00:00:00 2001 From: Arjun V Date: Mon, 16 Jul 2018 17:40:01 +0530 Subject: [PATCH 1/8] Update requirements.txt Adding flask as a requirement --- requirements.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index ccbe920..1ffe0bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -ImageHash==3.4 -scikit-learn==0.18 -numpy==1.11.1 +ImageHash +scikit-learn +numpy +flask From 44fc9e7935748d5537924ad35b2384d691ba7221 Mon Sep 17 00:00:00 2001 From: Arjun V Date: Mon, 16 Jul 2018 17:41:06 +0530 Subject: [PATCH 2/8] Update readme.md Correcting run command --- readme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readme.md b/readme.md index 44c850f..42e0933 100644 --- a/readme.md +++ b/readme.md @@ -4,6 +4,6 @@ this is an unrealistically simple example, but this clusters "brain expanding" m install deps: `pip install -r requirements.txt` -to run: `python cluster.py` +to run: `python2 server.py` -went with wavelet hashing because of the results of my unscientific experimentation (see `tests.py`) \ No newline at end of file +went with wavelet hashing because of the results of my unscientific experimentation (see `tests.py`) From f9084db5ecba6412cb0252621dbc243cbbb1a146 Mon Sep 17 00:00:00 2001 From: Arjun V Date: Wed, 18 Jul 2018 11:54:46 +0530 Subject: [PATCH 3/8] Update readme.md --- readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 42e0933..1a90887 100644 --- a/readme.md +++ b/readme.md @@ -4,6 +4,6 @@ this is an unrealistically simple example, but this clusters "brain expanding" m install deps: `pip install -r requirements.txt` -to run: `python2 server.py` +to run: `python3 server.py` went with wavelet hashing because of the results of my unscientific experimentation (see `tests.py`) From e2ae27e7c4f6ce6ce82e678e6f0b0bb9b3ae3962 Mon Sep 17 00:00:00 2001 
From: arjun v Date: Thu, 19 Jul 2018 12:49:10 +0530 Subject: [PATCH 4/8] Added clustering images to directory Changed default action to move images into a directory named "clusters". Accepts arguments for running the flask server. --- .gitignore | 4 ++ cluster.py | 11 +++++- data/.gitignore | 2 - readme.md | 18 +++++++-- requirements.txt | 7 ++-- server.py | 88 +++++++++++++++++++++++++++++++------------ static/img/.gitignore | 2 - 7 files changed, 96 insertions(+), 36 deletions(-) create mode 100644 .gitignore delete mode 100644 data/.gitignore delete mode 100644 static/img/.gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b5934e4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.directory +data/ +static/img/ +clusters/ \ No newline at end of file diff --git a/cluster.py b/cluster.py index 9983a3b..a27a93d 100644 --- a/cluster.py +++ b/cluster.py @@ -1,3 +1,4 @@ +import os import imagehash import numpy as np from PIL import Image @@ -38,4 +39,12 @@ def cluster(mat, fnames, eps, min_samples): clusters = defaultdict(list) for i, lbl in enumerate(labels): clusters[lbl].append(fnames[i]) - return clusters \ No newline at end of file + return clusters + +def create_directory(directory): + try: + os.makedirs(directory) + return True + except OSError: + if not os.path.isdir(directory): return False + return True \ No newline at end of file diff --git a/data/.gitignore b/data/.gitignore deleted file mode 100644 index c96a04f..0000000 --- a/data/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore \ No newline at end of file diff --git a/readme.md b/readme.md index 44c850f..ed803d1 100644 --- a/readme.md +++ b/readme.md @@ -2,8 +2,20 @@ super simple example of image clustering with image hashes ([wavelet hashing](ht this is an unrealistically simple example, but this clusters "brain expanding" memes from "political compass" memes pretty well. 
-install deps: `pip install -r requirements.txt` +install deps: `pip3 install -r requirements.txt` -to run: `python cluster.py` +went with wavelet hashing because of the results of my unscientific experimentation (see `tests.py`) -went with wavelet hashing because of the results of my unscientific experimentation (see `tests.py`) \ No newline at end of file +to run: `python3 server.py` + +``` +usage: server.py [-h] [-s] [--cluster distance] [--min-samples count] + +Image Cluster + +optional arguments: + -h, --help show this help message and exit + -s, --server run flask server (default: False) + --cluster distance cluster distance (default: 1) + --min-samples count minimum number of samples in a cluster (default: 1) +``` \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ccbe920..955d3bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -ImageHash==3.4 -scikit-learn==0.18 -numpy==1.11.1 +ImageHash +scikit-learn +numpy +flask \ No newline at end of file diff --git a/server.py b/server.py index 6137ef3..bab3103 100644 --- a/server.py +++ b/server.py @@ -1,16 +1,38 @@ import json +import shutil import numpy as np from glob import glob -from flask import Flask, request, render_template, abort -from cluster import compute_hashes, compute_dists, cluster +from cluster import compute_hashes, compute_dists, cluster, create_directory + +import argparse +parser = argparse.ArgumentParser(description='Image Cluster') + +# begin arguments +parser.add_argument('-s', '--server', action='store_true', + help='run flask server (default: %(default)s)\n\n') +parser.add_argument('--cluster', metavar='distance', default=1, + type=float, help='cluster distance (default: %(default)s)') +parser.add_argument('--min-samples', metavar='count', default=1, + type=int, help='minimum number of samples in a cluster (default: %(default)s)') + +args = parser.parse_args() if __name__ == '__main__': + + if not create_directory('static/img/'): + 
print('Couldn\'t create static images directory!') + raise SystemExit + try: mat = np.load('data/dist_mat.npy') fnames = json.load(open('data/fnames.json', 'r')) except FileNotFoundError: - print('computing hashes & distance matrix...') + if not create_directory('data'): + print('Couldn\'t create data directory!') + raise SystemExit + + print('Computing hashes & distance matrix...') hashes, fnames = compute_hashes(glob('static/img/*')) mat = compute_dists(hashes) @@ -19,25 +41,41 @@ json.dump(fnames, f) np.save('data/dist_mat.npy', mat) - clusters = {} - app = Flask(__name__) - - @app.route('/', methods=['GET', 'POST']) - def index(): - global clusters - if request.method == 'POST': - eps = float(request.form.get('eps', 20)) - min_samples = int(request.form.get('min_samples', 2)) - clusters = cluster(mat, fnames, eps, min_samples) - print(clusters) - return render_template('index.html', clusters=clusters) - - @app.route('/cluster/', methods=['GET', 'POST']) - def view_cluster(id): - try: - print(clusters[id]) - return render_template('cluster.html', cluster=clusters[id]) - except KeyError: - abort(404) - - app.run(host='0.0.0.0', port=5001) + + if not args.server: + eps = args.cluster + min_samples = args.min_samples + clusters = cluster(mat, fnames, eps, min_samples) + + print('Generated {} clusters!'.format(len(clusters))) + print('Moving images to "clusters" directory..') + for cluster in clusters: + create_directory('clusters/{}'.format(cluster)) + for image in clusters[cluster]: + shutil.move(image, 'clusters/{}/'.format(cluster)) + print('Created {} clusters and stored the images in "clusters" directory!'.format(len(clusters))) + + else: + from flask import Flask, request, render_template, abort + clusters = {} + app = Flask(__name__) + + @app.route('/', methods=['GET', 'POST']) + def index(): + global clusters + if request.method == 'POST': + eps = float(request.form.get('eps', 20)) + min_samples = int(request.form.get('min_samples', 2)) + clusters = 
cluster(mat, fnames, eps, min_samples) + print(clusters) + return render_template('index.html', clusters=clusters) + + @app.route('/cluster/', methods=['GET', 'POST']) + def view_cluster(id): + try: + print(clusters[id]) + return render_template('cluster.html', cluster=clusters[id]) + except KeyError: + abort(404) + + app.run(host='0.0.0.0', port=5002) diff --git a/static/img/.gitignore b/static/img/.gitignore deleted file mode 100644 index c96a04f..0000000 --- a/static/img/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore \ No newline at end of file From de3a965eb21012d0f0563bef00fdfee837a874ca Mon Sep 17 00:00:00 2001 From: arjun v Date: Thu, 19 Jul 2018 13:00:18 +0530 Subject: [PATCH 5/8] Added relevant help in README.md --- readme.md => README.md | 10 +++++----- requirements.txt | 2 +- tests.py | 33 --------------------------------- 3 files changed, 6 insertions(+), 39 deletions(-) rename readme.md => README.md (64%) delete mode 100644 tests.py diff --git a/readme.md b/README.md similarity index 64% rename from readme.md rename to README.md index 9b08200..1b9ae52 100644 --- a/readme.md +++ b/README.md @@ -1,12 +1,12 @@ super simple example of image clustering with image hashes ([wavelet hashing](https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5) in particular) and DBSCAN. -this is an unrealistically simple example, but this clusters "brain expanding" memes from "political compass" memes pretty well. - install deps: `pip3 install -r requirements.txt` -went with wavelet hashing because of the results of my unscientific experimentation (see `tests.py`) - -to run: `python3 server.py` +to run: + - store all images in `static/img/` directory + - empty `data` directory if you wish to do a re-run of clustering + - run `python3 server.py --server` to do an interactive run and decide the cluster distance and min-samples count. 
+ - run without `--server` argument to move the images to `clusters` directory ``` usage: server.py [-h] [-s] [--cluster distance] [--min-samples count] diff --git a/requirements.txt b/requirements.txt index 1ffe0bb..955d3bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ImageHash scikit-learn numpy -flask +flask \ No newline at end of file diff --git a/tests.py b/tests.py deleted file mode 100644 index a845b83..0000000 --- a/tests.py +++ /dev/null @@ -1,33 +0,0 @@ -import imagehash -from PIL import Image -from glob import glob -from itertools import combinations - -imgs = [(Image.open(fname), fname) for fname in glob('example/*.jpg')] - -for name, hashfunc in [ - ('phash', imagehash.phash), - ('ahash', imagehash.average_hash), - ('dhash', imagehash.dhash), - ('whash', imagehash.whash), - ('whash-db4', lambda img: imagehash.whash(img, mode='db4')) -]: - print('hashfunc:', name) - hashes = [(hashfunc(im), fname) for im, fname in imgs] - pairs = combinations(hashes, 2) - true_sim_dists = [] - true_dif_dists = [] - for (a, a_n), (b, b_n) in pairs: - # pc=political compass - if 'pc' in a_n or 'pc' in b_n: - true_dif_dists.append(a - b) - else: - true_sim_dists.append(a - b) - print(a_n, b_n, ':', a - b) - - # want the `max sim dist` to be less than - # the `min dif dist`. 
the bigger the gap, the better - print('min dif dist:', min(true_dif_dists)) - print('max sim dist:', max(true_sim_dists)) - print('clean split?:', max(true_sim_dists) < min(true_dif_dists)) - print('---') \ No newline at end of file From 06c69d9b327be2fdf9e416dcaf36889bea9f791a Mon Sep 17 00:00:00 2001 From: arjun v Date: Thu, 19 Jul 2018 13:05:58 +0530 Subject: [PATCH 6/8] Adding subdirectories --- .gitignore | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index b5934e4..08131f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ .directory -data/ -static/img/ -clusters/ \ No newline at end of file +data/* +static/img/* +clusters/* + +!static/img +!data +!clusters \ No newline at end of file From 7a06698f3e999380bea5a92bb487110f3aafee11 Mon Sep 17 00:00:00 2001 From: arjun v Date: Thu, 19 Jul 2018 13:10:21 +0530 Subject: [PATCH 7/8] Adding subdirectories --- .gitignore | 9 ++------- static/img/.gitignore | 2 ++ 2 files changed, 4 insertions(+), 7 deletions(-) create mode 100644 static/img/.gitignore diff --git a/.gitignore b/.gitignore index 08131f9..51e826a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,3 @@ .directory -data/* -static/img/* -clusters/* - -!static/img -!data -!clusters \ No newline at end of file +*.pyc +__pycache__/ \ No newline at end of file diff --git a/static/img/.gitignore b/static/img/.gitignore new file mode 100644 index 0000000..c96a04f --- /dev/null +++ b/static/img/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file From 87071f6e142ab04ab8586245d0d8a1691290a7e0 Mon Sep 17 00:00:00 2001 From: arjun v Date: Thu, 19 Jul 2018 13:11:13 +0530 Subject: [PATCH 8/8] Adding subdirectories --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 51e826a..f0d571f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .directory *.pyc -__pycache__/ \ No newline at end of file +__pycache__/ +data/ +clusters/ \ 
No newline at end of file