From 8a9aeaa485004f55094645aa6c75790994e4be67 Mon Sep 17 00:00:00 2001 From: "Runzhou Li (woozyking)" Date: Wed, 9 Jun 2021 18:39:27 -0400 Subject: [PATCH 1/3] nlp - fasttext unsupervised word vector + cosine similarity --- nlp/.gitignore | 138 +++++++++ nlp/Pipfile | 13 + nlp/Pipfile.lock | 667 +++++++++++++++++++++++++++++++++++++++++++ nlp/data/.gitkeep | 0 nlp/similarity.ipynb | 289 +++++++++++++++++++ 5 files changed, 1107 insertions(+) create mode 100644 nlp/.gitignore create mode 100644 nlp/Pipfile create mode 100644 nlp/Pipfile.lock create mode 100644 nlp/data/.gitkeep create mode 100644 nlp/similarity.ipynb diff --git a/nlp/.gitignore b/nlp/.gitignore new file mode 100644 index 0000000..a81c8ee --- /dev/null +++ b/nlp/.gitignore @@ -0,0 +1,138 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/nlp/Pipfile b/nlp/Pipfile new file mode 100644 index 0000000..19986f6 --- /dev/null +++ b/nlp/Pipfile @@ -0,0 +1,13 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +fasttext = "*" + +[dev-packages] +jupyter = "*" + +[requires] +python_version = "3.9" diff --git a/nlp/Pipfile.lock b/nlp/Pipfile.lock new file mode 100644 index 0000000..d32e12c --- /dev/null +++ b/nlp/Pipfile.lock @@ -0,0 +1,667 @@ +{ + "_meta": { + "hash": { + "sha256": "78eabcd1db3ca4798dd730caaaf351298c7ad2a99fbd504b5f41272aa9704cfc" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.9" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "fasttext": { + "hashes": [ + "sha256:665556f1f6dcb4fcbe25fa8ebcd4f71b18fa96a090de09d88d97a60cbd29dcb5" + ], + "index": "pypi", + "version": "==0.9.2" + }, + "numpy": { + "hashes": [ + "sha256:1676b0a292dd3c99e49305a16d7a9f42a4ab60ec522eac0d3dd20cdf362ac010", + "sha256:16f221035e8bd19b9dc9a57159e38d2dd060b48e93e1d843c49cb370b0f415fd", + "sha256:43909c8bb289c382170e0282158a38cf306a8ad2ff6dfadc447e90f9961bef43", + "sha256:4e465afc3b96dbc80cf4a5273e5e2b1e3451286361b4af70ce1adb2984d392f9", + "sha256:55b745fca0a5ab738647d0e4db099bd0a23279c32b31a783ad2ccea729e632df", + "sha256:5d050e1e4bc9ddb8656d7b4f414557720ddcca23a5b88dd7cff65e847864c400", + "sha256:637d827248f447e63585ca3f4a7d2dfaa882e094df6cfa177cc9cf9cd6cdf6d2", + "sha256:6690080810f77485667bfbff4f69d717c3be25e5b11bb2073e76bb3f578d99b4", + "sha256:66fbc6fed94a13b9801fb70b96ff30605ab0a123e775a5e7a26938b717c5d71a", + "sha256:67d44acb72c31a97a3d5d33d103ab06d8ac20770e1c5ad81bdb3f0c086a56cf6", + "sha256:6ca2b85a5997dabc38301a22ee43c82adcb53ff660b89ee88dded6b33687e1d8", + "sha256:6e51534e78d14b4a009a062641f465cfaba4fdcb046c3ac0b1f61dd97c861b1b", + "sha256:70eb5808127284c4e5c9e836208e09d685a7978b6a216db85960b1a112eeace8", + "sha256:830b044f4e64a76ba71448fce6e604c0fc47a0e54d8f6467be23749ac2cbd2fb", + "sha256:8b7bb4b9280da3b2856cb1fc425932f46fba609819ee1c62256f61799e6a51d2", + "sha256:a9c65473ebc342715cb2d7926ff1e202c26376c0dcaaee85a1fd4b8d8c1d3b2f", + "sha256:c1c09247ccea742525bdb5f4b5ceeacb34f95731647fe55774aa36557dbb5fa4", + "sha256:c5bf0e132acf7557fc9bb8ded8b53bbbbea8892f3c9a1738205878ca9434206a", + "sha256:db250fd3e90117e0312b611574cd1b3f78bec046783195075cbd7ba9c3d73f16", + "sha256:e515c9a93aebe27166ec9593411c58494fa98e5fcc219e47260d9ab8a1cc7f9f", + "sha256:e55185e51b18d788e49fe8305fd73ef4470596b33fc2c1ceb304566b99c71a69", + "sha256:ea9cff01e75a956dbee133fa8e5b68f2f92175233de2f88de3a682dd94deda65", + "sha256:f1452578d0516283c87608a5a5548b0cdde15b99650efdfd85182102ef7a7c17", + "sha256:f39a995e47cb8649673cfa0579fbdd1cdd33ea497d1728a6cb194d6252268e48" + ], + "markers": "python_version >= '3.7'", + "version": "==1.20.3" + }, + "pybind11": { + "hashes": [ + "sha256:2d8aebe1709bc367e34e3b23d8eccbf3f387ee9d5640548c6260d33b59f02405", + "sha256:d0e0aed9279656f21501243b707eb6e3b951e89e10c3271dedf3ae41c365e5ed" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==2.6.2" + } + }, + "develop": { + "appnope": { + "hashes": [ + "sha256:93aa393e9d6c54c5cd570ccadd8edad61ea0c4b9ea7a01409020c9aa019eb442", + "sha256:dd83cd4b5b460958838f6eb3000c660b1f9caf2a5b1de4264e941512f603258a" + ], + "markers": "sys_platform == 'darwin' and platform_system == 'Darwin'", + "version": "==0.1.2" + }, + "argon2-cffi": { + "hashes": [ + "sha256:05a8ac07c7026542377e38389638a8a1e9b78f1cd8439cd7493b39f08dd75fbf", + "sha256:0bf066bc049332489bb2d75f69216416329d9dc65deee127152caeb16e5ce7d5", + "sha256:18dee20e25e4be86680b178b35ccfc5d495ebd5792cd00781548d50880fee5c5", + "sha256:36320372133a003374ef4275fbfce78b7ab581440dfca9f9471be3dd9a522428", + "sha256:392c3c2ef91d12da510cfb6f9bae52512a4552573a9e27600bdb800e05905d2b", + "sha256:3aa804c0e52f208973845e8b10c70d8957c9e5a666f702793256242e9167c4e0", + "sha256:57358570592c46c420300ec94f2ff3b32cbccd10d38bdc12dc6979c4a8484fbc", + "sha256:6678bb047373f52bcff02db8afab0d2a77d83bde61cfecea7c5c62e2335cb203", + "sha256:6ea92c980586931a816d61e4faf6c192b4abce89aa767ff6581e6ddc985ed003", + "sha256:77e909cc756ef81d6abb60524d259d959bab384832f0c651ed7dcb6e5ccdbb78", + "sha256:7d455c802727710e9dfa69b74ccaab04568386ca17b0ad36350b622cd34606fe", + "sha256:8282b84ceb46b5b75c3a882b28856b8cd7e647ac71995e71b6705ec06fc232c3", + "sha256:8a84934bd818e14a17943de8099d41160da4a336bcc699bb4c394bbb9b94bd32", + "sha256:9bee3212ba4f560af397b6d7146848c32a800652301843df06b9e8f68f0f7361", + "sha256:9dfd5197852530294ecb5795c97a823839258dfd5eb9420233c7cfedec2058f2", + "sha256:b160416adc0f012fb1f12588a5e6954889510f82f698e23ed4f4fa57f12a0647", + "sha256:b94042e5dcaa5d08cf104a54bfae614be502c6f44c9c89ad1535b2ebdaacbd4c", + "sha256:ba7209b608945b889457f949cc04c8e762bed4fe3fec88ae9a6b7765ae82e496", + "sha256:cc0e028b209a5483b6846053d5fd7165f460a1f14774d79e632e75e7ae64b82b", + "sha256:d8029b2d3e4b4cea770e9e5a0104dd8fa185c1724a0f01528ae4826a6d25f97d", + "sha256:da7f0445b71db6d3a72462e04f36544b0de871289b0bc8a7cc87c0f5ec7079fa", + "sha256:e2db6e85c057c16d0bd3b4d2b04f270a7467c147381e8fd73cbbe5bc719832be" + ], + "version": "==20.1.0" + }, + "async-generator": { + "hashes": [ + "sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b", + "sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144" + ], + "markers": "python_version >= '3.5'", + "version": "==1.10" + }, + "attrs": { + "hashes": [ + "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1", + "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==21.2.0" + }, + "backcall": { + "hashes": [ + "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e", + "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255" + ], + "version": "==0.2.0" + }, + "bleach": { + "hashes": [ + "sha256:6123ddc1052673e52bab52cdc955bcb57a015264a1c57d37bea2f6b817af0125", + "sha256:98b3170739e5e83dd9dc19633f074727ad848cbedb6026708c8ac2d3b697a433" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.3.0" + }, + "cffi": { + "hashes": [ + "sha256:005a36f41773e148deac64b08f233873a4d0c18b053d37da83f6af4d9087b813", + "sha256:04c468b622ed31d408fea2346bec5bbffba2cc44226302a0de1ade9f5ea3d373", + "sha256:06d7cd1abac2ffd92e65c0609661866709b4b2d82dd15f611e602b9b188b0b69", + "sha256:06db6321b7a68b2bd6df96d08a5adadc1fa0e8f419226e25b2a5fbf6ccc7350f", + "sha256:0857f0ae312d855239a55c81ef453ee8fd24136eaba8e87a2eceba644c0d4c06", + "sha256:0f861a89e0043afec2a51fd177a567005847973be86f709bbb044d7f42fc4e05", + "sha256:1071534bbbf8cbb31b498d5d9db0f274f2f7a865adca4ae429e147ba40f73dea", + "sha256:158d0d15119b4b7ff6b926536763dc0714313aa59e320ddf787502c70c4d4bee", + "sha256:1bf1ac1984eaa7675ca8d5745a8cb87ef7abecb5592178406e55858d411eadc0", + "sha256:1f436816fc868b098b0d63b8920de7d208c90a67212546d02f84fe78a9c26396", + "sha256:24a570cd11895b60829e941f2613a4f79df1a27344cbbb82164ef2e0116f09c7", + "sha256:24ec4ff2c5c0c8f9c6b87d5bb53555bf267e1e6f70e52e5a9740d32861d36b6f", + "sha256:2894f2df484ff56d717bead0a5c2abb6b9d2bf26d6960c4604d5c48bbc30ee73", + "sha256:29314480e958fd8aab22e4a58b355b629c59bf5f2ac2492b61e3dc06d8c7a315", + "sha256:293e7ea41280cb28c6fcaaa0b1aa1f533b8ce060b9e701d78511e1e6c4a1de76", + "sha256:34eff4b97f3d982fb93e2831e6750127d1355a923ebaeeb565407b3d2f8d41a1", + "sha256:35f27e6eb43380fa080dccf676dece30bef72e4a67617ffda586641cd4508d49", + "sha256:3c3f39fa737542161d8b0d680df2ec249334cd70a8f420f71c9304bd83c3cbed", + "sha256:3d3dd4c9e559eb172ecf00a2a7517e97d1e96de2a5e610bd9b68cea3925b4892", + "sha256:43e0b9d9e2c9e5d152946b9c5fe062c151614b262fda2e7b201204de0b99e482", + "sha256:48e1c69bbacfc3d932221851b39d49e81567a4d4aac3b21258d9c24578280058", + "sha256:51182f8927c5af975fece87b1b369f722c570fe169f9880764b1ee3bca8347b5", + "sha256:58e3f59d583d413809d60779492342801d6e82fefb89c86a38e040c16883be53", + "sha256:5de7970188bb46b7bf9858eb6890aad302577a5f6f75091fd7cdd3ef13ef3045", + "sha256:65fa59693c62cf06e45ddbb822165394a288edce9e276647f0046e1ec26920f3", + "sha256:681d07b0d1e3c462dd15585ef5e33cb021321588bebd910124ef4f4fb71aef55", + "sha256:69e395c24fc60aad6bb4fa7e583698ea6cc684648e1ffb7fe85e3c1ca131a7d5", + "sha256:6c97d7350133666fbb5cf4abdc1178c812cb205dc6f41d174a7b0f18fb93337e", + "sha256:6e4714cc64f474e4d6e37cfff31a814b509a35cb17de4fb1999907575684479c", + "sha256:72d8d3ef52c208ee1c7b2e341f7d71c6fd3157138abf1a95166e6165dd5d4369", + "sha256:8ae6299f6c68de06f136f1f9e69458eae58f1dacf10af5c17353eae03aa0d827", + "sha256:8b198cec6c72df5289c05b05b8b0969819783f9418e0409865dac47288d2a053", + "sha256:99cd03ae7988a93dd00bcd9d0b75e1f6c426063d6f03d2f90b89e29b25b82dfa", + "sha256:9cf8022fb8d07a97c178b02327b284521c7708d7c71a9c9c355c178ac4bbd3d4", + "sha256:9de2e279153a443c656f2defd67769e6d1e4163952b3c622dcea5b08a6405322", + "sha256:9e93e79c2551ff263400e1e4be085a1210e12073a31c2011dbbda14bda0c6132", + "sha256:9ff227395193126d82e60319a673a037d5de84633f11279e336f9c0f189ecc62", + "sha256:a465da611f6fa124963b91bf432d960a555563efe4ed1cc403ba5077b15370aa", + "sha256:ad17025d226ee5beec591b52800c11680fca3df50b8b29fe51d882576e039ee0", + "sha256:afb29c1ba2e5a3736f1c301d9d0abe3ec8b86957d04ddfa9d7a6a42b9367e396", + "sha256:b85eb46a81787c50650f2392b9b4ef23e1f126313b9e0e9013b35c15e4288e2e", + "sha256:bb89f306e5da99f4d922728ddcd6f7fcebb3241fc40edebcb7284d7514741991", + "sha256:cbde590d4faaa07c72bf979734738f328d239913ba3e043b1e98fe9a39f8b2b6", + "sha256:cc5a8e069b9ebfa22e26d0e6b97d6f9781302fe7f4f2b8776c3e1daea35f1adc", + "sha256:cd2868886d547469123fadc46eac7ea5253ea7fcb139f12e1dfc2bbd406427d1", + "sha256:d42b11d692e11b6634f7613ad8df5d6d5f8875f5d48939520d351007b3c13406", + "sha256:df5052c5d867c1ea0b311fb7c3cd28b19df469c056f7fdcfe88c7473aa63e333", + "sha256:f2d45f97ab6bb54753eab54fffe75aaf3de4ff2341c9daee1987ee1837636f1d", + "sha256:fd78e5fee591709f32ef6edb9a015b4aa1a5022598e36227500c8f4e02328d9c" + ], + "version": "==1.14.5" + }, + "decorator": { + "hashes": [ + "sha256:6e5c199c16f7a9f0e3a61a4a54b3d27e7dad0dbdde92b944426cb20914376323", + "sha256:72ecfba4320a893c53f9706bebb2d55c270c1e51a28789361aa93e4a21319ed5" + ], + "markers": "python_version >= '3.5'", + "version": "==5.0.9" + }, + "defusedxml": { + "hashes": [ + "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", + "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==0.7.1" + }, + "entrypoints": { + "hashes": [ + "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", + "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" + ], + "markers": "python_version >= '2.7'", + "version": "==0.3" + }, + "ipykernel": { + "hashes": [ + "sha256:29eee66548ee7c2edb7941de60c0ccf0a7a8dd957341db0a49c5e8e6a0fcb712", + "sha256:e976751336b51082a89fc2099fb7f96ef20f535837c398df6eab1283c2070884" + ], + "markers": "python_version >= '3.5'", + "version": "==5.5.5" + }, + "ipython": { + "hashes": [ + "sha256:9bc24a99f5d19721fb8a2d1408908e9c0520a17fff2233ffe82620847f17f1b6", + "sha256:d513e93327cf8657d6467c81f1f894adc125334ffe0e4ddd1abbb1c78d828703" + ], + "markers": "python_version >= '3.3'", + "version": "==7.24.1" + }, + "ipython-genutils": { + "hashes": [ + "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", + "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" + ], + "version": "==0.2.0" + }, + "ipywidgets": { + "hashes": [ + "sha256:9f1a43e620530f9e570e4a493677d25f08310118d315b00e25a18f12913c41f0", + "sha256:e6513cfdaf5878de30f32d57f6dc2474da395a2a2991b94d487406c0ab7f55ca" + ], + "version": "==7.6.3" + }, + "jedi": { + "hashes": [ + "sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93", + "sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707" + ], + "markers": "python_version >= '3.6'", + "version": "==0.18.0" + }, + "jinja2": { + "hashes": [ + "sha256:1f06f2da51e7b56b8f238affdd6b4e2c61e39598a378cc49345bc1bd42a978a4", + "sha256:703f484b47a6af502e743c9122595cc812b0271f661722403114f71a79d0f5a4" + ], + "markers": "python_version >= '3.6'", + "version": "==3.0.1" + }, + "jsonschema": { + "hashes": [ + "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163", + "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a" + ], + "version": "==3.2.0" + }, + "jupyter": { + "hashes": [ + "sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7", + "sha256:5b290f93b98ffbc21c0c7e749f054b3267782166d72fa5e3ed1ed4eaf34a2b78", + "sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f" + ], + "index": "pypi", + "version": "==1.0.0" + }, + "jupyter-client": { + "hashes": [ + "sha256:c4bca1d0846186ca8be97f4d2fa6d2bae889cce4892a167ffa1ba6bd1f73e782", + "sha256:e053a2c44b6fa597feebe2b3ecb5eea3e03d1d91cc94351a52931ee1426aecfc" + ], + "markers": "python_version >= '3.5'", + "version": "==6.1.12" + }, + "jupyter-console": { + "hashes": [ + "sha256:242248e1685039cd8bff2c2ecb7ce6c1546eb50ee3b08519729e6e881aec19c7", + "sha256:7799c4ea951e0e96ba8260575423cb323ea5a03fcf5503560fa3e15748869e27" + ], + "markers": "python_version >= '3.6'", + "version": "==6.4.0" + }, + "jupyter-core": { + "hashes": [ + "sha256:79025cb3225efcd36847d0840f3fc672c0abd7afd0de83ba8a1d3837619122b4", + "sha256:8c6c0cac5c1b563622ad49321d5ec47017bd18b94facb381c6973a0486395f8e" + ], + "markers": "python_version >= '3.6'", + "version": "==4.7.1" + }, + "jupyterlab-pygments": { + "hashes": [ + "sha256:abfb880fd1561987efaefcb2d2ac75145d2a5d0139b1876d5be806e32f630008", + "sha256:cfcda0873626150932f438eccf0f8bf22bfa92345b814890ab360d666b254146" + ], + "version": "==0.1.2" + }, + "jupyterlab-widgets": { + "hashes": [ + "sha256:5c1a29a84d3069208cb506b10609175b249b6486d6b1cbae8fcde2a11584fb78", + "sha256:caeaf3e6103180e654e7d8d2b81b7d645e59e432487c1d35a41d6d3ee56b3fef" + ], + "markers": "python_version >= '3.6'", + "version": "==1.0.0" + }, + "markupsafe": { + "hashes": [ + "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298", + "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64", + "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b", + "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567", + "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff", + "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74", + "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35", + "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26", + "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7", + "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75", + "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f", + "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135", + "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8", + "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a", + "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914", + "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18", + "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8", + "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2", + "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d", + "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b", + "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f", + "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb", + "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833", + "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415", + "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902", + "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9", + "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d", + "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066", + "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f", + "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5", + "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94", + "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509", + "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51", + "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.1" + }, + "matplotlib-inline": { + "hashes": [ + "sha256:5cf1176f554abb4fa98cb362aa2b55c500147e4bdbb07e3fda359143e1da0811", + "sha256:f41d5ff73c9f5385775d5c0bc13b424535c8402fe70ea8210f93e11f3683993e" + ], + "markers": "python_version >= '3.5'", + "version": "==0.1.2" + }, + "mistune": { + "hashes": [ + "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e", + "sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4" + ], + "version": "==0.8.4" + }, + "nbclient": { + "hashes": [ + "sha256:db17271330c68c8c88d46d72349e24c147bb6f34ec82d8481a8f025c4d26589c", + "sha256:e79437364a2376892b3f46bedbf9b444e5396cfb1bc366a472c37b48e9551500" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==0.5.3" + }, + "nbconvert": { + "hashes": [ + "sha256:39e9f977920b203baea0be67eea59f7b37a761caa542abe80f5897ce3cf6311d", + "sha256:cbbc13a86dfbd4d1b5dee106539de0795b4db156c894c2c5dc382062bbc29002" + ], + "markers": "python_version >= '3.6'", + "version": "==6.0.7" + }, + "nbformat": { + "hashes": [ + "sha256:b516788ad70771c6250977c1374fcca6edebe6126fd2adb5a69aa5c2356fd1c8", + "sha256:eb8447edd7127d043361bc17f2f5a807626bc8e878c7709a1c647abda28a9171" + ], + "markers": "python_version >= '3.5'", + "version": "==5.1.3" + }, + "nest-asyncio": { + "hashes": [ + "sha256:76d6e972265063fe92a90b9cc4fb82616e07d586b346ed9d2c89a4187acea39c", + "sha256:afc5a1c515210a23c461932765691ad39e8eba6551c055ac8d5546e69250d0aa" + ], + "markers": "python_version >= '3.5'", + "version": "==1.5.1" + }, + "notebook": { + "hashes": [ + "sha256:9c4625e2a2aa49d6eae4ce20cbc3d8976db19267e32d2a304880e0c10bf8aef9", + "sha256:f7f0a71a999c7967d9418272ae4c3378a220bd28330fbfb49860e46cf8a5838a" + ], + "markers": "python_version >= '3.6'", + "version": "==6.4.0" + }, + "packaging": { + "hashes": [ + "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5", + "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==20.9" + }, + "pandocfilters": { + "hashes": [ + "sha256:bc63fbb50534b4b1f8ebe1860889289e8af94a23bff7445259592df25a3906eb" + ], + "version": "==1.4.3" + }, + "parso": { + "hashes": [ + "sha256:12b83492c6239ce32ff5eed6d3639d6a536170723c6f3f1506869f1ace413398", + "sha256:a8c4922db71e4fdb90e0d0bc6e50f9b273d3397925e5e60a717e719201778d22" + ], + "markers": "python_version >= '3.6'", + "version": "==0.8.2" + }, + "pexpect": { + "hashes": [ + "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937", + "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c" + ], + "markers": "sys_platform != 'win32'", + "version": "==4.8.0" + }, + "pickleshare": { + "hashes": [ + "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", + "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" + ], + "version": "==0.7.5" + }, + "prometheus-client": { + "hashes": [ + "sha256:3a8baade6cb80bcfe43297e33e7623f3118d660d41387593758e2fb1ea173a86", + "sha256:b014bc76815eb1399da8ce5fc84b7717a3e63652b0c0f8804092c9363acab1b2" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.11.0" + }, + "prompt-toolkit": { + "hashes": [ + "sha256:bf00f22079f5fadc949f42ae8ff7f05702826a97059ffcc6281036ad40ac6f04", + "sha256:e1b4f11b9336a28fa11810bc623c357420f69dfdb6d2dac41ca2c21a55c033bc" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==3.0.18" + }, + "ptyprocess": { + "hashes": [ + "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", + "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220" + ], + "markers": "os_name != 'nt'", + "version": "==0.7.0" + }, + "pycparser": { + "hashes": [ + "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", + "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.20" + }, + "pygments": { + "hashes": [ + "sha256:a18f47b506a429f6f4b9df81bb02beab9ca21d0a5fee38ed15aef65f0545519f", + "sha256:d66e804411278594d764fc69ec36ec13d9ae9147193a1740cd34d272ca383b8e" + ], + "markers": "python_version >= '3.5'", + "version": "==2.9.0" + }, + "pyparsing": { + "hashes": [ + "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1", + "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.4.7" + }, + "pyrsistent": { + "hashes": [ + "sha256:2e636185d9eb976a18a8a8e96efce62f2905fea90041958d8cc2a189756ebf3e" + ], + "markers": "python_version >= '3.5'", + "version": "==0.17.3" + }, + "python-dateutil": { + "hashes": [ + "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", + "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.8.1" + }, + "pyzmq": { + "hashes": [ + "sha256:089b974ec04d663b8685ac90e86bfe0e4da9d911ff3cf52cb765ff22408b102d", + "sha256:0ea7f4237991b0f745a4432c63e888450840bf8cb6c48b93fb7d62864f455529", + "sha256:0f0f27eaab9ba7b92d73d71c51d1a04464a1da6097a252d007922103253d2313", + "sha256:12ffcf33db6ba7c0e5aaf901e65517f5e2b719367b80bcbfad692f546a297c7a", + "sha256:1389b615917d4196962a9b469e947ba862a8ec6f5094a47da5e7a8d404bc07a4", + "sha256:18dd2ca4540c476558099891c129e6f94109971d110b549db2a9775c817cedbd", + "sha256:24fb5bb641f0b2aa25fc3832f4b6fc62430f14a7d328229fe994b2bcdc07c93a", + "sha256:285514956c08c7830da9d94e01f5414661a987831bd9f95e4d89cc8aaae8da10", + "sha256:41049cff5265e9cd75606aa2c90a76b9c80b98d8fe70ee08cf4af3cedb113358", + "sha256:461ed80d741692d9457ab820b1cc057ba9c37c394e67b647b639f623c8b321f6", + "sha256:4b8fb1b3174b56fd020e4b10232b1764e52cf7f3babcfb460c5253bdc48adad0", + "sha256:4c4fe69c7dc0d13d4ae180ad650bb900854367f3349d3c16f0569f6c6447f698", + "sha256:4e9b9a2f6944acdaf57316436c1acdcb30b8df76726bcf570ad9342bc5001654", + "sha256:6355f81947e1fe6e7bb9e123aeb3067264391d3ebe8402709f824ef8673fa6f3", + "sha256:68be16107f41563b9f67d93dff1c9f5587e0f76aa8fd91dc04c83d813bcdab1f", + "sha256:68e2c4505992ab5b89f976f89a9135742b18d60068f761bef994a6805f1cae0c", + "sha256:7040d6dd85ea65703904d023d7f57fab793d7ffee9ba9e14f3b897f34ff2415d", + "sha256:734ea6565c71fc2d03d5b8c7d0d7519c96bb5567e0396da1b563c24a4ac66f0c", + "sha256:9ee48413a2d3cd867fd836737b4c89c24cea1150a37f4856d82d20293fa7519f", + "sha256:a1c77796f395804d6002ff56a6a8168c1f98579896897ad7e35665a9b4a9eec5", + "sha256:b2f707b52e09098a7770503e39294ca6e22ae5138ffa1dd36248b6436d23d78e", + "sha256:bf80b2cec42d96117248b99d3c86e263a00469c840a778e6cb52d916f4fdf82c", + "sha256:c4674004ed64685a38bee222cd75afa769424ec603f9329f0dd4777138337f48", + "sha256:c6a81c9e6754465d09a87e3acd74d9bb1f0039b2d785c6899622f0afdb41d760", + "sha256:c6d0c32532a0519997e1ded767e184ebb8543bdb351f8eff8570bd461e874efc", + "sha256:c8fff75af4c7af92dce9f81fa2a83ed009c3e1f33ee8b5222db2ef80b94e242e", + "sha256:cb9f9fe1305ef69b65794655fd89b2209b11bff3e837de981820a8aa051ef914", + "sha256:d3ecfee2ee8d91ab2e08d2d8e89302c729b244e302bbc39c5b5dde42306ff003", + "sha256:d5e5be93e1714a59a535bbbc086b9e4fd2448c7547c5288548f6fd86353cad9e", + "sha256:de5806be66c9108e4dcdaced084e8ceae14100aa559e2d57b4f0cceb98c462de", + "sha256:f49755684a963731479ff3035d45a8185545b4c9f662d368bd349c419839886d", + "sha256:fc712a90401bcbf3fa25747f189d6dcfccbecc32712701cad25c6355589dac57" + ], + "markers": "python_version >= '3.6'", + "version": "==22.1.0" + }, + "qtconsole": { + "hashes": [ + "sha256:12c734494901658787339dea9bbd82f3dc0d5e394071377a1c77b4a0954d7d8b", + "sha256:3a2adecc43ff201a08972fb2179df22e7b3a08d71b9ed680f46ad1bfd4fb9132" + ], + "markers": "python_version >= '3.6'", + "version": "==5.1.0" + }, + "qtpy": { + "hashes": [ + "sha256:2db72c44b55d0fe1407be8fba35c838ad0d6d3bb81f23007886dc1fc0f459c8d", + "sha256:fa0b8363b363e89b2a6f49eddc162a04c0699ae95e109a6be3bb145a913190ea" + ], + "version": "==1.9.0" + }, + "send2trash": { + "hashes": [ + "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2", + "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b" + ], + "version": "==1.5.0" + }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.16.0" + }, + "terminado": { + "hashes": [ + "sha256:048ce7b271ad1f94c48130844af1de163e54913b919f8c268c89b36a6d468d7c", + "sha256:46fd07c9dc7db7321922270d544a1f18eaa7a02fd6cd4438314f27a687cabbea" + ], + "markers": "python_version >= '3.6'", + "version": "==0.10.0" + }, + "testpath": { + "hashes": [ + "sha256:1acf7a0bcd3004ae8357409fc33751e16d37ccc650921da1094a86581ad1e417", + "sha256:8044f9a0bab6567fc644a3593164e872543bb44225b0e24846e2c89237937589" + ], + "markers": "python_version >= '3.5'", + "version": "==0.5.0" + }, + "tornado": { + "hashes": [ + "sha256:0a00ff4561e2929a2c37ce706cb8233b7907e0cdc22eab98888aca5dd3775feb", + "sha256:0d321a39c36e5f2c4ff12b4ed58d41390460f798422c4504e09eb5678e09998c", + "sha256:1e8225a1070cd8eec59a996c43229fe8f95689cb16e552d130b9793cb570a288", + "sha256:20241b3cb4f425e971cb0a8e4ffc9b0a861530ae3c52f2b0434e6c1b57e9fd95", + "sha256:25ad220258349a12ae87ede08a7b04aca51237721f63b1808d39bdb4b2164558", + "sha256:33892118b165401f291070100d6d09359ca74addda679b60390b09f8ef325ffe", + "sha256:33c6e81d7bd55b468d2e793517c909b139960b6c790a60b7991b9b6b76fb9791", + "sha256:3447475585bae2e77ecb832fc0300c3695516a47d46cefa0528181a34c5b9d3d", + "sha256:34ca2dac9e4d7afb0bed4677512e36a52f09caa6fded70b4e3e1c89dbd92c326", + "sha256:3e63498f680547ed24d2c71e6497f24bca791aca2fe116dbc2bd0ac7f191691b", + "sha256:548430be2740e327b3fe0201abe471f314741efcb0067ec4f2d7dcfb4825f3e4", + "sha256:6196a5c39286cc37c024cd78834fb9345e464525d8991c21e908cc046d1cc02c", + "sha256:61b32d06ae8a036a6607805e6720ef00a3c98207038444ba7fd3d169cd998910", + "sha256:6286efab1ed6e74b7028327365cf7346b1d777d63ab30e21a0f4d5b275fc17d5", + "sha256:65d98939f1a2e74b58839f8c4dab3b6b3c1ce84972ae712be02845e65391ac7c", + "sha256:66324e4e1beede9ac79e60f88de548da58b1f8ab4b2f1354d8375774f997e6c0", + "sha256:6c77c9937962577a6a76917845d06af6ab9197702a42e1346d8ae2e76b5e3675", + "sha256:70dec29e8ac485dbf57481baee40781c63e381bebea080991893cd297742b8fd", + "sha256:7250a3fa399f08ec9cb3f7b1b987955d17e044f1ade821b32e5f435130250d7f", + "sha256:748290bf9112b581c525e6e6d3820621ff020ed95af6f17fedef416b27ed564c", + "sha256:7da13da6f985aab7f6f28debab00c67ff9cbacd588e8477034c0652ac141feea", + "sha256:8f959b26f2634a091bb42241c3ed8d3cedb506e7c27b8dd5c7b9f745318ddbb6", + "sha256:9de9e5188a782be6b1ce866e8a51bc76a0fbaa0e16613823fc38e4fc2556ad05", + "sha256:a48900ecea1cbb71b8c71c620dee15b62f85f7c14189bdeee54966fbd9a0c5bd", + "sha256:b87936fd2c317b6ee08a5741ea06b9d11a6074ef4cc42e031bc6403f82a32575", + "sha256:c77da1263aa361938476f04c4b6c8916001b90b2c2fdd92d8d535e1af48fba5a", + "sha256:cb5ec8eead331e3bb4ce8066cf06d2dfef1bfb1b2a73082dfe8a161301b76e37", + "sha256:cc0ee35043162abbf717b7df924597ade8e5395e7b66d18270116f8745ceb795", + "sha256:d14d30e7f46a0476efb0deb5b61343b1526f73ebb5ed84f23dc794bdb88f9d9f", + "sha256:d371e811d6b156d82aa5f9a4e08b58debf97c302a35714f6f45e35139c332e32", + "sha256:d3d20ea5782ba63ed13bc2b8c291a053c8d807a8fa927d941bd718468f7b950c", + "sha256:d3f7594930c423fd9f5d1a76bee85a2c36fd8b4b16921cae7e965f22575e9c01", + "sha256:dcef026f608f678c118779cd6591c8af6e9b4155c44e0d1bc0c87c036fb8c8c4", + "sha256:e0791ac58d91ac58f694d8d2957884df8e4e2f6687cdf367ef7eb7497f79eaa2", + "sha256:e385b637ac3acaae8022e7e47dfa7b83d3620e432e3ecb9a3f7f58f150e50921", + "sha256:e519d64089b0876c7b467274468709dadf11e41d65f63bba207e04217f47c085", + "sha256:e7229e60ac41a1202444497ddde70a48d33909e484f96eb0da9baf8dc68541df", + "sha256:ed3ad863b1b40cd1d4bd21e7498329ccaece75db5a5bf58cd3c9f130843e7102", + "sha256:f0ba29bafd8e7e22920567ce0d232c26d4d47c8b5cf4ed7b562b5db39fa199c5", + "sha256:fa2ba70284fa42c2a5ecb35e322e68823288a4251f9ba9cc77be04ae15eada68", + "sha256:fba85b6cd9c39be262fcd23865652920832b61583de2a2ca907dbd8e8a8c81e5" + ], + "markers": "python_version >= '3.5'", + "version": "==6.1" + }, + "traitlets": { + "hashes": [ + "sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396", + "sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426" + ], + "markers": "python_version >= '3.7'", + "version": "==5.0.5" + }, + "wcwidth": { + "hashes": [ + "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784", + "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83" + ], + "version": "==0.2.5" + }, + "webencodings": { + "hashes": [ + "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", + "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" + ], + "version": "==0.5.1" + }, + "widgetsnbextension": { + "hashes": [ + "sha256:079f87d87270bce047512400efd70238820751a11d2d8cb137a5a5bdbaf255c7", + "sha256:bd314f8ceb488571a5ffea6cc5b9fc6cba0adaf88a9d2386b93a489751938bcd" + ], + "version": "==3.5.1" + } + } +} diff --git a/nlp/data/.gitkeep b/nlp/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/nlp/similarity.ipynb b/nlp/similarity.ipynb new file mode 100644 index 0000000..220b160 --- /dev/null +++ b/nlp/similarity.ipynb @@ -0,0 +1,289 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "956b2884-ac87-4149-b753-aab18a6a51af", + "metadata": {}, + "outputs": [], + "source": [ + "import fasttext" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e75dbff3", + "metadata": {}, + "outputs": [], + "source": [ + "# english wiki first 1 billion bytes\n", + "# https://fasttext.cc/docs/en/unsupervised-tutorial.html\n", + "model_path = './wiki_1b.bin'\n", + "# model = fasttext.train_unsupervised('data/fil9')\n", + "# model.save_model('wiki_1b.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "89dd445e-d362-4025-884e-173312194877", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" + ] + } + ], + "source": [ + "model = fasttext.load_model(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c98b13eb-f59e-4949-83de-27809c214d93", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "933.15 MB\n" + ] + } + ], + "source": [ + "import os\n", + " \n", + "file_size = os.path.getsize('./wiki_1b.bin')\n", + "print(f'{round(file_size / 1024 / 1024, 2)} MB')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7f907a96-60e1-45d2-bbe7-0771740a3769", + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "For now we only support quantization of supervised models", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquantize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# unsupervised word embedding cannot be quantized\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# This means that we have to allow loading of the model on-demand instead bundling like the supervised model for release notes classification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/share/virtualenvs/nlp-qPCzfwe0/lib/python3.9/site-packages/fasttext/FastText.py\u001b[0m in \u001b[0;36mquantize\u001b[0;34m(self, input, qout, cutoff, retrain, epoch, lr, thread, verbose, dsub, qnorm)\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minput\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 357\u001b[0m \u001b[0minput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 358\u001b[0;31m self.f.quantize(\n\u001b[0m\u001b[1;32m 359\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mqout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcutoff\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepoch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdsub\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0mqnorm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: For now we only support quantization of supervised models" + ] + } + ], + "source": [ + "model.quantize() # unsupervised word embedding cannot be quantized\n", + "\n", + "# This means that we have to allow loading of the model on-demand instead bundling like the supervised model for release notes classification" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "548bd065-d956-4c8e-9b6f-25b5ec63c950", + "metadata": {}, + "outputs": [], + "source": [ + "# https://masongallo.github.io/machine/learning,/python/2016/07/29/cosine-similarity.html\n", + "import numpy as np\n", + "\n", + "def cos_sim(a, b):\n", + " \"\"\"Takes 2 vectors a, b and returns the cosine similarity according \n", + " to the definition of the dot product\n", + " \"\"\"\n", + " dot_product = np.dot(a, b)\n", + " norm_a = np.linalg.norm(a)\n", + " norm_b = np.linalg.norm(b)\n", + " return dot_product / (norm_a * norm_b)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a9399c71-09f3-428a-8694-c9bf58ece0a1", + "metadata": {}, + "outputs": [], + "source": [ + "def sim(model, a, b):\n", + " if type(model) is str:\n", + " model = fasttext.load_model(model)\n", + "\n", + " return cos_sim(*map(model.get_sentence_vector, (a, b)))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1760f1bd-46d7-47b5-adc8-b7537c851571", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" + ] + }, + { + "data": { + "text/plain": [ + "0.9222608" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sim(\n", + " './wiki_1b.bin',\n", + " 'fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", + " 'fix issue on loading beacons for VWI job creation',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c902ff25-1ea5-4232-b1c6-8a8fa8d9a92f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9222608" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# prefix stripped\n", + "sim(\n", + " model,\n", + " 'fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", + " 'fix issue on loading beacons for VWI job creation',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "368f5b45-b38a-44f7-a6c9-58f47d46fe81", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9264801" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# prefix intact\n", + "sim(\n", + " model,\n", + " 'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", + " 'builder - fix issue on loading beacons for VWI job creation',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "30ac8064-509d-4115-be3e-02ec809b4d9b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.79175115" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sim(\n", + " model,\n", + " 'modal - explore popstate events & add default onbeforeunload alert if there are unsaved changes',\n", + " 'builder - fix issue on loading beacons for VWI job creation',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "bb87108c-a618-481c-a01f-d43151fb08a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.86247617" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sim(\n", + " model,\n", + " 'react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master',\n", + " 'Map/GeoCohortMap - add setViewportBBox , setApiBBox'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9dd0237e-b982-40cc-a072-81f6cdaca54b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From dc0119d8ca030769aa158360fd2ec677af775247 Mon Sep 17 00:00:00 2001 From: "Runzhou Li (woozyking)" Date: Thu, 10 Jun 2021 11:37:48 -0400 Subject: [PATCH 2/3] nlp - normalization and larger sample test run (~17% dedupe) --- nlp/similarity.ipynb | 1729 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 1720 insertions(+), 9 deletions(-) diff --git a/nlp/similarity.ipynb b/nlp/similarity.ipynb index 220b160..333be09 100644 --- a/nlp/similarity.ipynb +++ b/nlp/similarity.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "id": "956b2884-ac87-4149-b753-aab18a6a51af", "metadata": {}, "outputs": [], @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "id": "e75dbff3", "metadata": {}, "outputs": [], @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "id": "89dd445e-d362-4025-884e-173312194877", "metadata": {}, "outputs": [ @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "id": "c98b13eb-f59e-4949-83de-27809c214d93", "metadata": {}, "outputs": [ @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "id": "7f907a96-60e1-45d2-bbe7-0771740a3769", "metadata": {}, "outputs": [ @@ -76,7 +76,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquantize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# unsupervised word embedding cannot be quantized\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# This means that we have to allow loading of the model on-demand instead bundling like the supervised model for release notes classification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquantize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# unsupervised word embedding cannot be quantized\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# This means that we have to allow loading of the model on-demand instead bundling like the supervised model for release notes classification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/share/virtualenvs/nlp-qPCzfwe0/lib/python3.9/site-packages/fasttext/FastText.py\u001b[0m in \u001b[0;36mquantize\u001b[0;34m(self, input, qout, cutoff, retrain, epoch, lr, thread, verbose, dsub, qnorm)\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minput\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 357\u001b[0m \u001b[0minput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 358\u001b[0;31m self.f.quantize(\n\u001b[0m\u001b[1;32m 359\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mqout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcutoff\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepoch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdsub\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0mqnorm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: For now we only support quantization of supervised models" ] @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 17, "id": "548bd065-d956-4c8e-9b6f-25b5ec63c950", "metadata": {}, "outputs": [], @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 18, "id": "a9399c71-09f3-428a-8694-c9bf58ece0a1", "metadata": {}, "outputs": [], @@ -258,9 +258,1720 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9dd0237e-b982-40cc-a072-81f6cdaca54b", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/woozyking/.local/share/virtualenvs/nlp-qPCzfwe0/lib/python3.9/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n" + ] + } + ], + "source": [ + "# clear stopwords using gensim preprocessing collection\n", + "from gensim.parsing.preprocessing import STOPWORDS" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3287c2df", + "metadata": {}, + "outputs": [], + "source": [ + "def normalize(s):\n", + " s = s.lower().split()\n", + " s = [w for w in s if w not in STOPWORDS]\n", + " return ' '.join(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cde36f02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'react-maps - merge [g2m] map/geocohortmap - add setviewportbbox , setapibbox #76 master'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "normalize('react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "e85d1709", + "metadata": {}, + "outputs": [], + "source": [ + "def sim_norm(model, a, b, normalizations = (normalize, model.get_sentence_vector)):\n", + " if type(model) is str:\n", + " model = fasttext.load_model(model)\n", + "\n", + " for fn in normalizations:\n", + " a, b = map(fn, (a, b))\n", + "\n", + " return cos_sim(a, b)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "755cfb18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.91396576" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sim_norm(\n", + " model,\n", + " 'react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master',\n", + " 'Map/GeoCohortMap - add setViewportBBox , setApiBBox'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "616a5181", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.79462" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sim_norm(\n", + " model,\n", + " 'modal - explore popstate events & add default onbeforeunload alert if there are unsaved changes',\n", + " 'builder - fix issue on loading beacons for VWI job creation',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "2005c41b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.93234414" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sim_norm(\n", + " model,\n", + " 'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", + " 'builder - fix issue on loading beacons for VWI job creation',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "dfd47891", + "metadata": {}, + "outputs": [], + "source": [ + "prs = [\n", + " pr.strip() for pr in '''\n", + " builder - fix issue on loading beacons for VWI job creation\n", + " Hub - change connection type to direct when doing direct upload\n", + " QL / New Taxonomies + Missing views + Layer views\n", + " Popular Times - add ConscientAI for access\n", + " Hub - add direct upload as connection type\n", + " DevOps/Fix - Update cluster names\n", + " New Airflow cluster + task priority + sensor to reschedule\n", + " Camp/Insights/GeoCohort - Add bounded box filtering\n", + " Athena/BeaconConv - Fix - SQL dict to json\n", + " Adserver/Beacons - Add support for custom content as url params\n", + " Report/GeoCohort - use bbox coords to pull postal code level insights\n", + " package - update package to v0.6.1\n", + " Geocohort mvt\n", + " Intelligence map\n", + " Map/GeoCohortMap - add setViewportBBox , setApiBBox\n", + " 13 - Data processing\n", + " Work with generated/fake data\n", + " Modal/refactor\n", + " Add map widget\n", + " Lumen tree/tailwindcss\n", + " Commit watch integration and mention in git conventions\n", + " journal - setup Notion dev-journal automation workflow\n", + " losses visualization\n", + " '''.strip().split('\\n')\n", + "]\n", + "\n", + "dids = [\n", + " did.strip() for did in '''\n", + " react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master\n", + " react-maps- upgrade package and make new release 0.6.1\n", + " overlord/geocohort - fix passing the clean list of GeoCohortFSA aggregated data to map #1833\n", + " overlord/geocohort - upgrade @eqworks/react-maps package in #1833 and merge with master\n", + " overlord/geocohort - review [G2M] table/cluster chart enhancement #1829\n", + " overlord/geocohort - add extra tab for GeoCohort overview #1834\n", + " overseer - test last master branch with bbox added to insights/geocohort\n", + " overlord/geocohort - adjust tooltipFormatX and axisBottomLabelDisplayFn for dates in geocohort aggregated data #1834\n", + " overlord/geocohort - add all geocohorts to the time line chart #1834\n", + " react-maps/GeoCohortMap - open for review [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76\n", + " overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights\n", + " react-maps - reviewed https://github.com/EQWorks/react-maps/pull/75\n", + " overlord/geocohort - use bbox coords to pull postal code level insights #1833\n", + " overlord/\n", + " react-maps/GeoCohortMap/Map - design both maps to send out bbox coords of the current viewport #76\n", + " atom - test Quebec cities with geocoder and api calls and collaborate with Ianec to fix Quebec cities geom pull from DB\n", + " QL - refactor modal setup\n", + " modal - explore popstate events & add default onbeforeunload alert if there are unsaved changes\n", + " ql-connect - ql design prototype overview + ui/backend updates + discussion on Tailwindcss exploration experience\n", + " modal/refactor - isolate query save-success modal configs\n", + " modal/refactor - isolate query delete modal configs\n", + " modal/refactor - isolate execution cancel modal configs + file cleanups\n", + " modal/refactor - add reset modal configs\n", + " modal/refactor - cleanups + remove unused modal config states\n", + " design - add new buttons for rest/query/cancel-executions\n", + " design/tree-selector - organize classes with clsx\n", + " modal/refactor - re-style Card component\n", + " modal/refactor - isolate Modal component\n", + " modal/refactor - separate common vs ql components\n", + " modal/refactor - re-style Textfield component\n", + " modal/refactor - isolate query save modal configs\n", + " design/tree-selector - replace Tree selector with tailwindcss\n", + " design/tree-selector - create & style List component\n", + " design/tree-selector - create & style Dialog Component\n", + " design/tree-selector - handle search & support ListMenu for tree selector\n", + " design/tree-selector - support TreeMenu for tree selector\n", + " design/explore - explore tailwindcss\n", + " design/explore - explore headlessui\n", + " design/tree-selector - project setup + init tailwindcss\n", + " design/tree-selector - create & style Textfield component\n", + " notion/journal - fix bug with template literals\n", + " notion/journal - add name-transform for displaying number of prev-day incomplete tasks\n", + " notion/journal - setup db retrieving process\n", + " notion/journal - setup journal routine automation workflows\n", + " product - review https://github.com/EQWorks/ws-problems/issues/165 (client-side crashes on server hibernation)\n", + " common - revise https://github.com/EQWorks/common/pull/27\n", + " data - interview https://eqworks.workable.com/backend/jobs/547053/browser/interview/candidate/129031036\n", + " data - interview https://eqworks.workable.com/backend/jobs/547053/browser/interview/candidate/126188189\n", + " snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)\n", + " python-curriculum/12 - elaborate on the use of Categorical columns, hash device ids\n", + " tech-evan - review medium post \"Bridging Web UI into Notebooks\"\n", + " data - review https://github.com/EQWorks/ws-problems/issues/127\n", + " data - review https://github.com/EQWorks/ws-problems/issues/145\n", + " data - review https://github.com/EQWorks/ws-problems/issues/150\n", + " data - review https://github.com/EQWorks/ws-problems/issues/151\n", + " python-curriculum/12 - finalize/grammar cleanup, diversify interactive example (filter by both region/name)\n", + " data - review candidate/120151018 and candidate/121056418\n", + " python-curriculum/12 - written material\n", + " python-curriculum/12 - widgets (nested to select columns and values)\n", + " connector-gcs - create connection_hub_gcs_dev test bucket (under EQ Hyperlocal, no org google cloud proj)\n", + " Airflow - Migrate dev stage workflow to new cluster\n", + " Review the data select process design\n", + " locus-ql - make a user flow map for the views selection process\n", + " design/locus-ql - refractor design so that selected columns are differentiable by view categories\n", + " design/locus-ql - polish the user flow for selecting views and columns, check on possible interactions at each step\n", + " design/locus-ql - incorporate team's feedback into the next iteration of views shopping cart prototype\n", + " locus-ql - wireframe different user flows for views shopping cart feature\n", + " design/locus-ql - built new views popup sequence, selection panel, individual column cards\n", + " '''.strip().split('\\n')\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "0eaed848", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
didprsimsim_norm
0react-maps - merge [G2M] Map/GeoCohortMap - ad...builder - fix issue on loading beacons for VWI...0.7262380.761086
1react-maps - merge [G2M] Map/GeoCohortMap - ad...Hub - change connection type to direct when do...0.7537360.635717
2react-maps - merge [G2M] Map/GeoCohortMap - ad...QL / New Taxonomies + Missing views + Layer views0.4818090.613164
3react-maps - merge [G2M] Map/GeoCohortMap - ad...Popular Times - add ConscientAI for access0.7477020.744181
4react-maps - merge [G2M] Map/GeoCohortMap - ad...Hub - add direct upload as connection type0.7566440.691825
...............
1559design/locus-ql - built new views popup sequen...Add map widget0.5274970.614477
1560design/locus-ql - built new views popup sequen...Lumen tree/tailwindcss0.6017050.580893
1561design/locus-ql - built new views popup sequen...Commit watch integration and mention in git co...0.6619030.598540
1562design/locus-ql - built new views popup sequen...journal - setup Notion dev-journal automation ...0.7498510.745144
1563design/locus-ql - built new views popup sequen...losses visualization0.5234570.523457
\n", + "

1564 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " did \\\n", + "0 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", + "1 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", + "2 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", + "3 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", + "4 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", + "... ... \n", + "1559 design/locus-ql - built new views popup sequen... \n", + "1560 design/locus-ql - built new views popup sequen... \n", + "1561 design/locus-ql - built new views popup sequen... \n", + "1562 design/locus-ql - built new views popup sequen... \n", + "1563 design/locus-ql - built new views popup sequen... \n", + "\n", + " pr sim sim_norm \n", + "0 builder - fix issue on loading beacons for VWI... 0.726238 0.761086 \n", + "1 Hub - change connection type to direct when do... 0.753736 0.635717 \n", + "2 QL / New Taxonomies + Missing views + Layer views 0.481809 0.613164 \n", + "3 Popular Times - add ConscientAI for access 0.747702 0.744181 \n", + "4 Hub - add direct upload as connection type 0.756644 0.691825 \n", + "... ... ... ... \n", + "1559 Add map widget 0.527497 0.614477 \n", + "1560 Lumen tree/tailwindcss 0.601705 0.580893 \n", + "1561 Commit watch integration and mention in git co... 0.661903 0.598540 \n", + "1562 journal - setup Notion dev-journal automation ... 0.749851 0.745144 \n", + "1563 losses visualization 0.523457 0.523457 \n", + "\n", + "[1564 rows x 4 columns]" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "data = []\n", + "for did in dids:\n", + " for pr in prs:\n", + " data.append({\n", + " 'did': did,\n", + " 'pr': pr,\n", + " 'sim': sim(model, pr, did),\n", + " 'sim_norm': sim_norm(model, pr, did),\n", + " })\n", + "\n", + "df = pd.DataFrame(data)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "abc500c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
didprsimsim_norm
14react-maps - merge [G2M] Map/GeoCohortMap - ad...Map/GeoCohortMap - add setViewportBBox , setAp...0.8624760.913966
34react-maps- upgrade package and make new relea...package - update package to v0.6.10.8016210.769318
46overlord/geocohort - fix passing the clean lis...builder - fix issue on loading beacons for VWI...0.8354790.798545
47overlord/geocohort - fix passing the clean lis...Hub - change connection type to direct when do...0.8045850.710856
53overlord/geocohort - fix passing the clean lis...Camp/Insights/GeoCohort - Add bounded box filt...0.7697360.824506
...............
1518locus-ql - wireframe different user flows for ...builder - fix issue on loading beacons for VWI...0.8134550.779025
1525locus-ql - wireframe different user flows for ...Camp/Insights/GeoCohort - Add bounded box filt...0.7803830.819209
1528locus-ql - wireframe different user flows for ...Report/GeoCohort - use bbox coords to pull pos...0.8359630.819813
1532locus-ql - wireframe different user flows for ...Map/GeoCohortMap - add setViewportBBox , setAp...0.7253690.804823
1551design/locus-ql - built new views popup sequen...Report/GeoCohort - use bbox coords to pull pos...0.8123390.813832
\n", + "

148 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " did \\\n", + "14 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", + "34 react-maps- upgrade package and make new relea... \n", + "46 overlord/geocohort - fix passing the clean lis... \n", + "47 overlord/geocohort - fix passing the clean lis... \n", + "53 overlord/geocohort - fix passing the clean lis... \n", + "... ... \n", + "1518 locus-ql - wireframe different user flows for ... \n", + "1525 locus-ql - wireframe different user flows for ... \n", + "1528 locus-ql - wireframe different user flows for ... \n", + "1532 locus-ql - wireframe different user flows for ... \n", + "1551 design/locus-ql - built new views popup sequen... \n", + "\n", + " pr sim sim_norm \n", + "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.913966 \n", + "34 package - update package to v0.6.1 0.801621 0.769318 \n", + "46 builder - fix issue on loading beacons for VWI... 0.835479 0.798545 \n", + "47 Hub - change connection type to direct when do... 0.804585 0.710856 \n", + "53 Camp/Insights/GeoCohort - Add bounded box filt... 0.769736 0.824506 \n", + "... ... ... ... \n", + "1518 builder - fix issue on loading beacons for VWI... 0.813455 0.779025 \n", + "1525 Camp/Insights/GeoCohort - Add bounded box filt... 0.780383 0.819209 \n", + "1528 Report/GeoCohort - use bbox coords to pull pos... 0.835963 0.819813 \n", + "1532 Map/GeoCohortMap - add setViewportBBox , setAp... 0.725369 0.804823 \n", + "1551 Report/GeoCohort - use bbox coords to pull pos... 0.812339 0.813832 \n", + "\n", + "[148 rows x 4 columns]" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# filter by threshold of sim\n", + "dff = df.query('sim >= 0.8 or sim_norm >= 0.8')\n", + "dff" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "21ea58b3", + "metadata": {}, + "outputs": [], + "source": [ + "def printer(df):\n", + " for row in df.itertuples():\n", + " for k, v in row._asdict().items():\n", + " if k != 'Index':\n", + " print(f'{k}: {v}')\n", + "\n", + " print('-' * 10, '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "a57f7ed4", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
didprsimsim_norm
1390Airflow - Migrate dev stage workflow to new cl...Report/GeoCohort - use bbox coords to pull pos...0.8036920.740882
385QL - refactor modal setupModal/refactor0.7014370.837877
1413Review the data select process designReport/GeoCohort - use bbox coords to pull pos...0.8452250.807889
355atom - test Quebec cities with geocoder and ap...Report/GeoCohort - use bbox coords to pull pos...0.8606910.853623
1367connector-gcs - create connection_hub_gcs_dev ...Report/GeoCohort - use bbox coords to pull pos...0.8341300.852461
1194data - review https://github.com/EQWorks/ws-pr...journal - setup Notion dev-journal automation ...0.8445920.863798
1217data - review https://github.com/EQWorks/ws-pr...journal - setup Notion dev-journal automation ...0.8491670.865895
1240data - review https://github.com/EQWorks/ws-pr...journal - setup Notion dev-journal automation ...0.8472760.865414
1263data - review https://github.com/EQWorks/ws-pr...journal - setup Notion dev-journal automation ...0.8450110.863669
552design - add new buttons for rest/query/cancel...builder - fix issue on loading beacons for VWI...0.8453060.797208
1551design/locus-ql - built new views popup sequen...Report/GeoCohort - use bbox coords to pull pos...0.8123390.813832
1505design/locus-ql - incorporate team's feedback ...Report/GeoCohort - use bbox coords to pull pos...0.8544590.814493
1482design/locus-ql - polish the user flow for sel...Report/GeoCohort - use bbox coords to pull pos...0.8618000.848261
880design/tree-selector - project setup + init ta...New Airflow cluster + task priority + sensor t...0.7954240.813049
1436locus-ql - make a user flow map for the views ...Report/GeoCohort - use bbox coords to pull pos...0.8461600.808592
1528locus-ql - wireframe different user flows for ...Report/GeoCohort - use bbox coords to pull pos...0.8359630.819813
401modal - explore popstate events & add default ...Report/GeoCohort - use bbox coords to pull pos...0.8501130.829083
520modal/refactor - add reset modal configsMap/GeoCohortMap - add setViewportBBox , setAp...0.7692300.812446
539modal/refactor - cleanups + remove unused moda...Report/GeoCohort - use bbox coords to pull pos...0.8118390.794998
638modal/refactor - isolate Modal componentModal/refactor0.7871510.842918
489modal/refactor - isolate execution cancel moda...New Airflow cluster + task priority + sensor t...0.8175020.802322
477modal/refactor - isolate query delete modal co...Modal/refactor0.7710110.832558
699modal/refactor - isolate query save modal configsAdserver/Beacons - Add support for custom cont...0.7720910.805339
446modal/refactor - isolate query save-success mo...Adserver/Beacons - Add support for custom cont...0.7894460.801402
612modal/refactor - re-style Card componentMap/GeoCohortMap - add setViewportBBox , setAp...0.7263900.804693
681modal/refactor - re-style Textfield componentMap/GeoCohortMap - add setViewportBBox , setAp...0.7597230.809706
953notion/journal - add name-transform for displa...Report/GeoCohort - use bbox coords to pull pos...0.8458340.833265
930notion/journal - fix bug with template literalsReport/GeoCohort - use bbox coords to pull pos...0.8100710.781882
987notion/journal - setup db retrieving processjournal - setup Notion dev-journal automation ...0.8788580.872043
1010notion/journal - setup journal routine automat...journal - setup Notion dev-journal automation ...0.9658120.967793
240overlord/GeoCohortMap - clean up and open for ...Report/GeoCohort - use bbox coords to pull pos...0.9343730.954730
194overlord/geocohort - add all geocohorts to the...Report/GeoCohort - use bbox coords to pull pos...0.8466380.801580
125overlord/geocohort - add extra tab for GeoCoho...Report/GeoCohort - use bbox coords to pull pos...0.8132270.807819
171overlord/geocohort - adjust tooltipFormatX and...Report/GeoCohort - use bbox coords to pull pos...0.8759530.853324
56overlord/geocohort - fix passing the clean lis...Report/GeoCohort - use bbox coords to pull pos...0.8884290.893503
79overlord/geocohort - upgrade @eqworks/react-ma...Report/GeoCohort - use bbox coords to pull pos...0.8307060.787613
286overlord/geocohort - use bbox coords to pull p...Report/GeoCohort - use bbox coords to pull pos...0.9768560.975869
148overseer - test last master branch with bbox a...Report/GeoCohort - use bbox coords to pull pos...0.8484600.807371
1012product - review https://github.com/EQWorks/ws...builder - fix issue on loading beacons for VWI...0.8167800.799459
1137python-curriculum/12 - elaborate on the use of...Report/GeoCohort - use bbox coords to pull pos...0.8573420.837314
1275python-curriculum/12 - finalize/grammar cleanu...Report/GeoCohort - use bbox coords to pull pos...0.8570110.846689
1344python-curriculum/12 - widgets (nested to sele...Report/GeoCohort - use bbox coords to pull pos...0.8198850.762207
424ql-connect - ql design prototype overview + ui...Report/GeoCohort - use bbox coords to pull pos...0.8025330.813566
14react-maps - merge [G2M] Map/GeoCohortMap - ad...Map/GeoCohortMap - add setViewportBBox , setAp...0.8624760.913966
34react-maps- upgrade package and make new relea...package - update package to v0.6.10.8016210.769318
221react-maps/GeoCohortMap - open for review [G2M...Map/GeoCohortMap - add setViewportBBox , setAp...0.8851630.935586
332react-maps/GeoCohortMap/Map - design both maps...Report/GeoCohort - use bbox coords to pull pos...0.8873650.880106
1104snoke - fix beacons list not loading in VWI jo...builder - fix issue on loading beacons for VWI...0.9264800.932344
\n", + "
" + ], + "text/plain": [ + " did \\\n", + "1390 Airflow - Migrate dev stage workflow to new cl... \n", + "385 QL - refactor modal setup \n", + "1413 Review the data select process design \n", + "355 atom - test Quebec cities with geocoder and ap... \n", + "1367 connector-gcs - create connection_hub_gcs_dev ... \n", + "1194 data - review https://github.com/EQWorks/ws-pr... \n", + "1217 data - review https://github.com/EQWorks/ws-pr... \n", + "1240 data - review https://github.com/EQWorks/ws-pr... \n", + "1263 data - review https://github.com/EQWorks/ws-pr... \n", + "552 design - add new buttons for rest/query/cancel... \n", + "1551 design/locus-ql - built new views popup sequen... \n", + "1505 design/locus-ql - incorporate team's feedback ... \n", + "1482 design/locus-ql - polish the user flow for sel... \n", + "880 design/tree-selector - project setup + init ta... \n", + "1436 locus-ql - make a user flow map for the views ... \n", + "1528 locus-ql - wireframe different user flows for ... \n", + "401 modal - explore popstate events & add default ... \n", + "520 modal/refactor - add reset modal configs \n", + "539 modal/refactor - cleanups + remove unused moda... \n", + "638 modal/refactor - isolate Modal component \n", + "489 modal/refactor - isolate execution cancel moda... \n", + "477 modal/refactor - isolate query delete modal co... \n", + "699 modal/refactor - isolate query save modal configs \n", + "446 modal/refactor - isolate query save-success mo... \n", + "612 modal/refactor - re-style Card component \n", + "681 modal/refactor - re-style Textfield component \n", + "953 notion/journal - add name-transform for displa... \n", + "930 notion/journal - fix bug with template literals \n", + "987 notion/journal - setup db retrieving process \n", + "1010 notion/journal - setup journal routine automat... \n", + "240 overlord/GeoCohortMap - clean up and open for ... \n", + "194 overlord/geocohort - add all geocohorts to the... \n", + "125 overlord/geocohort - add extra tab for GeoCoho... \n", + "171 overlord/geocohort - adjust tooltipFormatX and... \n", + "56 overlord/geocohort - fix passing the clean lis... \n", + "79 overlord/geocohort - upgrade @eqworks/react-ma... \n", + "286 overlord/geocohort - use bbox coords to pull p... \n", + "148 overseer - test last master branch with bbox a... \n", + "1012 product - review https://github.com/EQWorks/ws... \n", + "1137 python-curriculum/12 - elaborate on the use of... \n", + "1275 python-curriculum/12 - finalize/grammar cleanu... \n", + "1344 python-curriculum/12 - widgets (nested to sele... \n", + "424 ql-connect - ql design prototype overview + ui... \n", + "14 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", + "34 react-maps- upgrade package and make new relea... \n", + "221 react-maps/GeoCohortMap - open for review [G2M... \n", + "332 react-maps/GeoCohortMap/Map - design both maps... \n", + "1104 snoke - fix beacons list not loading in VWI jo... \n", + "\n", + " pr sim sim_norm \n", + "1390 Report/GeoCohort - use bbox coords to pull pos... 0.803692 0.740882 \n", + "385 Modal/refactor 0.701437 0.837877 \n", + "1413 Report/GeoCohort - use bbox coords to pull pos... 0.845225 0.807889 \n", + "355 Report/GeoCohort - use bbox coords to pull pos... 0.860691 0.853623 \n", + "1367 Report/GeoCohort - use bbox coords to pull pos... 0.834130 0.852461 \n", + "1194 journal - setup Notion dev-journal automation ... 0.844592 0.863798 \n", + "1217 journal - setup Notion dev-journal automation ... 0.849167 0.865895 \n", + "1240 journal - setup Notion dev-journal automation ... 0.847276 0.865414 \n", + "1263 journal - setup Notion dev-journal automation ... 0.845011 0.863669 \n", + "552 builder - fix issue on loading beacons for VWI... 0.845306 0.797208 \n", + "1551 Report/GeoCohort - use bbox coords to pull pos... 0.812339 0.813832 \n", + "1505 Report/GeoCohort - use bbox coords to pull pos... 0.854459 0.814493 \n", + "1482 Report/GeoCohort - use bbox coords to pull pos... 0.861800 0.848261 \n", + "880 New Airflow cluster + task priority + sensor t... 0.795424 0.813049 \n", + "1436 Report/GeoCohort - use bbox coords to pull pos... 0.846160 0.808592 \n", + "1528 Report/GeoCohort - use bbox coords to pull pos... 0.835963 0.819813 \n", + "401 Report/GeoCohort - use bbox coords to pull pos... 0.850113 0.829083 \n", + "520 Map/GeoCohortMap - add setViewportBBox , setAp... 0.769230 0.812446 \n", + "539 Report/GeoCohort - use bbox coords to pull pos... 0.811839 0.794998 \n", + "638 Modal/refactor 0.787151 0.842918 \n", + "489 New Airflow cluster + task priority + sensor t... 0.817502 0.802322 \n", + "477 Modal/refactor 0.771011 0.832558 \n", + "699 Adserver/Beacons - Add support for custom cont... 0.772091 0.805339 \n", + "446 Adserver/Beacons - Add support for custom cont... 0.789446 0.801402 \n", + "612 Map/GeoCohortMap - add setViewportBBox , setAp... 0.726390 0.804693 \n", + "681 Map/GeoCohortMap - add setViewportBBox , setAp... 0.759723 0.809706 \n", + "953 Report/GeoCohort - use bbox coords to pull pos... 0.845834 0.833265 \n", + "930 Report/GeoCohort - use bbox coords to pull pos... 0.810071 0.781882 \n", + "987 journal - setup Notion dev-journal automation ... 0.878858 0.872043 \n", + "1010 journal - setup Notion dev-journal automation ... 0.965812 0.967793 \n", + "240 Report/GeoCohort - use bbox coords to pull pos... 0.934373 0.954730 \n", + "194 Report/GeoCohort - use bbox coords to pull pos... 0.846638 0.801580 \n", + "125 Report/GeoCohort - use bbox coords to pull pos... 0.813227 0.807819 \n", + "171 Report/GeoCohort - use bbox coords to pull pos... 0.875953 0.853324 \n", + "56 Report/GeoCohort - use bbox coords to pull pos... 0.888429 0.893503 \n", + "79 Report/GeoCohort - use bbox coords to pull pos... 0.830706 0.787613 \n", + "286 Report/GeoCohort - use bbox coords to pull pos... 0.976856 0.975869 \n", + "148 Report/GeoCohort - use bbox coords to pull pos... 0.848460 0.807371 \n", + "1012 builder - fix issue on loading beacons for VWI... 0.816780 0.799459 \n", + "1137 Report/GeoCohort - use bbox coords to pull pos... 0.857342 0.837314 \n", + "1275 Report/GeoCohort - use bbox coords to pull pos... 0.857011 0.846689 \n", + "1344 Report/GeoCohort - use bbox coords to pull pos... 0.819885 0.762207 \n", + "424 Report/GeoCohort - use bbox coords to pull pos... 0.802533 0.813566 \n", + "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.913966 \n", + "34 package - update package to v0.6.1 0.801621 0.769318 \n", + "221 Map/GeoCohortMap - add setViewportBBox , setAp... 0.885163 0.935586 \n", + "332 Report/GeoCohort - use bbox coords to pull pos... 0.887365 0.880106 \n", + "1104 builder - fix issue on loading beacons for VWI... 0.926480 0.932344 " + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get the pair of pr, did with the highest sim score\n", + "idx = dff.groupby('did').sim.idxmax()\n", + "sim_pairs = df.iloc[idx]\n", + "sim_pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "4da4fd60", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
didprsimsim_norm
1390Airflow - Migrate dev stage workflow to new cl...Report/GeoCohort - use bbox coords to pull pos...0.8036920.740882
385QL - refactor modal setupModal/refactor0.7014370.837877
1424Review the data select process designjournal - setup Notion dev-journal automation ...0.7944720.833235
355atom - test Quebec cities with geocoder and ap...Report/GeoCohort - use bbox coords to pull pos...0.8606910.853623
1367connector-gcs - create connection_hub_gcs_dev ...Report/GeoCohort - use bbox coords to pull pos...0.8341300.852461
1194data - review https://github.com/EQWorks/ws-pr...journal - setup Notion dev-journal automation ...0.8445920.863798
1217data - review https://github.com/EQWorks/ws-pr...journal - setup Notion dev-journal automation ...0.8491670.865895
1240data - review https://github.com/EQWorks/ws-pr...journal - setup Notion dev-journal automation ...0.8472760.865414
1263data - review https://github.com/EQWorks/ws-pr...journal - setup Notion dev-journal automation ...0.8450110.863669
566design - add new buttons for rest/query/cancel...Map/GeoCohortMap - add setViewportBBox , setAp...0.7261000.811494
1551design/locus-ql - built new views popup sequen...Report/GeoCohort - use bbox coords to pull pos...0.8123390.813832
1505design/locus-ql - incorporate team's feedback ...Report/GeoCohort - use bbox coords to pull pos...0.8544590.814493
1482design/locus-ql - polish the user flow for sel...Report/GeoCohort - use bbox coords to pull pos...0.8618000.848261
880design/tree-selector - project setup + init ta...New Airflow cluster + task priority + sensor t...0.7954240.813049
1436locus-ql - make a user flow map for the views ...Report/GeoCohort - use bbox coords to pull pos...0.8461600.808592
1528locus-ql - wireframe different user flows for ...Report/GeoCohort - use bbox coords to pull pos...0.8359630.819813
401modal - explore popstate events & add default ...Report/GeoCohort - use bbox coords to pull pos...0.8501130.829083
513modal/refactor - add reset modal configsCamp/Insights/GeoCohort - Add bounded box filt...0.7622160.815759
536modal/refactor - cleanups + remove unused moda...Camp/Insights/GeoCohort - Add bounded box filt...0.7826180.808684
638modal/refactor - isolate Modal componentModal/refactor0.7871510.842918
489modal/refactor - isolate execution cancel moda...New Airflow cluster + task priority + sensor t...0.8175020.802322
477modal/refactor - isolate query delete modal co...Modal/refactor0.7710110.832558
707modal/refactor - isolate query save modal configsModal/refactor0.7655450.827165
454modal/refactor - isolate query save-success mo...Modal/refactor0.7609890.831344
612modal/refactor - re-style Card componentMap/GeoCohortMap - add setViewportBBox , setAp...0.7263900.804693
681modal/refactor - re-style Textfield componentMap/GeoCohortMap - add setViewportBBox , setAp...0.7597230.809706
953notion/journal - add name-transform for displa...Report/GeoCohort - use bbox coords to pull pos...0.8458340.833265
930notion/journal - fix bug with template literalsReport/GeoCohort - use bbox coords to pull pos...0.8100710.781882
987notion/journal - setup db retrieving processjournal - setup Notion dev-journal automation ...0.8788580.872043
1010notion/journal - setup journal routine automat...journal - setup Notion dev-journal automation ...0.9658120.967793
240overlord/GeoCohortMap - clean up and open for ...Report/GeoCohort - use bbox coords to pull pos...0.9343730.954730
194overlord/geocohort - add all geocohorts to the...Report/GeoCohort - use bbox coords to pull pos...0.8466380.801580
129overlord/geocohort - add extra tab for GeoCoho...Map/GeoCohortMap - add setViewportBBox , setAp...0.7210310.816096
171overlord/geocohort - adjust tooltipFormatX and...Report/GeoCohort - use bbox coords to pull pos...0.8759530.853324
56overlord/geocohort - fix passing the clean lis...Report/GeoCohort - use bbox coords to pull pos...0.8884290.893503
69overlord/geocohort - upgrade @eqworks/react-ma...builder - fix issue on loading beacons for VWI...0.8174890.806360
286overlord/geocohort - use bbox coords to pull p...Report/GeoCohort - use bbox coords to pull pos...0.9768560.975869
148overseer - test last master branch with bbox a...Report/GeoCohort - use bbox coords to pull pos...0.8484600.807371
1033product - review https://github.com/EQWorks/ws...journal - setup Notion dev-journal automation ...0.8067340.823701
1137python-curriculum/12 - elaborate on the use of...Report/GeoCohort - use bbox coords to pull pos...0.8573420.837314
1275python-curriculum/12 - finalize/grammar cleanu...Report/GeoCohort - use bbox coords to pull pos...0.8570110.846689
1343python-curriculum/12 - widgets (nested to sele...Adserver/Beacons - Add support for custom cont...0.8164930.775432
424ql-connect - ql design prototype overview + ui...Report/GeoCohort - use bbox coords to pull pos...0.8025330.813566
14react-maps - merge [G2M] Map/GeoCohortMap - ad...Map/GeoCohortMap - add setViewportBBox , setAp...0.8624760.913966
34react-maps- upgrade package and make new relea...package - update package to v0.6.10.8016210.769318
221react-maps/GeoCohortMap - open for review [G2M...Map/GeoCohortMap - add setViewportBBox , setAp...0.8851630.935586
332react-maps/GeoCohortMap/Map - design both maps...Report/GeoCohort - use bbox coords to pull pos...0.8873650.880106
1104snoke - fix beacons list not loading in VWI jo...builder - fix issue on loading beacons for VWI...0.9264800.932344
\n", + "
" + ], + "text/plain": [ + " did \\\n", + "1390 Airflow - Migrate dev stage workflow to new cl... \n", + "385 QL - refactor modal setup \n", + "1424 Review the data select process design \n", + "355 atom - test Quebec cities with geocoder and ap... \n", + "1367 connector-gcs - create connection_hub_gcs_dev ... \n", + "1194 data - review https://github.com/EQWorks/ws-pr... \n", + "1217 data - review https://github.com/EQWorks/ws-pr... \n", + "1240 data - review https://github.com/EQWorks/ws-pr... \n", + "1263 data - review https://github.com/EQWorks/ws-pr... \n", + "566 design - add new buttons for rest/query/cancel... \n", + "1551 design/locus-ql - built new views popup sequen... \n", + "1505 design/locus-ql - incorporate team's feedback ... \n", + "1482 design/locus-ql - polish the user flow for sel... \n", + "880 design/tree-selector - project setup + init ta... \n", + "1436 locus-ql - make a user flow map for the views ... \n", + "1528 locus-ql - wireframe different user flows for ... \n", + "401 modal - explore popstate events & add default ... \n", + "513 modal/refactor - add reset modal configs \n", + "536 modal/refactor - cleanups + remove unused moda... \n", + "638 modal/refactor - isolate Modal component \n", + "489 modal/refactor - isolate execution cancel moda... \n", + "477 modal/refactor - isolate query delete modal co... \n", + "707 modal/refactor - isolate query save modal configs \n", + "454 modal/refactor - isolate query save-success mo... \n", + "612 modal/refactor - re-style Card component \n", + "681 modal/refactor - re-style Textfield component \n", + "953 notion/journal - add name-transform for displa... \n", + "930 notion/journal - fix bug with template literals \n", + "987 notion/journal - setup db retrieving process \n", + "1010 notion/journal - setup journal routine automat... \n", + "240 overlord/GeoCohortMap - clean up and open for ... \n", + "194 overlord/geocohort - add all geocohorts to the... \n", + "129 overlord/geocohort - add extra tab for GeoCoho... \n", + "171 overlord/geocohort - adjust tooltipFormatX and... \n", + "56 overlord/geocohort - fix passing the clean lis... \n", + "69 overlord/geocohort - upgrade @eqworks/react-ma... \n", + "286 overlord/geocohort - use bbox coords to pull p... \n", + "148 overseer - test last master branch with bbox a... \n", + "1033 product - review https://github.com/EQWorks/ws... \n", + "1137 python-curriculum/12 - elaborate on the use of... \n", + "1275 python-curriculum/12 - finalize/grammar cleanu... \n", + "1343 python-curriculum/12 - widgets (nested to sele... \n", + "424 ql-connect - ql design prototype overview + ui... \n", + "14 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", + "34 react-maps- upgrade package and make new relea... \n", + "221 react-maps/GeoCohortMap - open for review [G2M... \n", + "332 react-maps/GeoCohortMap/Map - design both maps... \n", + "1104 snoke - fix beacons list not loading in VWI jo... \n", + "\n", + " pr sim sim_norm \n", + "1390 Report/GeoCohort - use bbox coords to pull pos... 0.803692 0.740882 \n", + "385 Modal/refactor 0.701437 0.837877 \n", + "1424 journal - setup Notion dev-journal automation ... 0.794472 0.833235 \n", + "355 Report/GeoCohort - use bbox coords to pull pos... 0.860691 0.853623 \n", + "1367 Report/GeoCohort - use bbox coords to pull pos... 0.834130 0.852461 \n", + "1194 journal - setup Notion dev-journal automation ... 0.844592 0.863798 \n", + "1217 journal - setup Notion dev-journal automation ... 0.849167 0.865895 \n", + "1240 journal - setup Notion dev-journal automation ... 0.847276 0.865414 \n", + "1263 journal - setup Notion dev-journal automation ... 0.845011 0.863669 \n", + "566 Map/GeoCohortMap - add setViewportBBox , setAp... 0.726100 0.811494 \n", + "1551 Report/GeoCohort - use bbox coords to pull pos... 0.812339 0.813832 \n", + "1505 Report/GeoCohort - use bbox coords to pull pos... 0.854459 0.814493 \n", + "1482 Report/GeoCohort - use bbox coords to pull pos... 0.861800 0.848261 \n", + "880 New Airflow cluster + task priority + sensor t... 0.795424 0.813049 \n", + "1436 Report/GeoCohort - use bbox coords to pull pos... 0.846160 0.808592 \n", + "1528 Report/GeoCohort - use bbox coords to pull pos... 0.835963 0.819813 \n", + "401 Report/GeoCohort - use bbox coords to pull pos... 0.850113 0.829083 \n", + "513 Camp/Insights/GeoCohort - Add bounded box filt... 0.762216 0.815759 \n", + "536 Camp/Insights/GeoCohort - Add bounded box filt... 0.782618 0.808684 \n", + "638 Modal/refactor 0.787151 0.842918 \n", + "489 New Airflow cluster + task priority + sensor t... 0.817502 0.802322 \n", + "477 Modal/refactor 0.771011 0.832558 \n", + "707 Modal/refactor 0.765545 0.827165 \n", + "454 Modal/refactor 0.760989 0.831344 \n", + "612 Map/GeoCohortMap - add setViewportBBox , setAp... 0.726390 0.804693 \n", + "681 Map/GeoCohortMap - add setViewportBBox , setAp... 0.759723 0.809706 \n", + "953 Report/GeoCohort - use bbox coords to pull pos... 0.845834 0.833265 \n", + "930 Report/GeoCohort - use bbox coords to pull pos... 0.810071 0.781882 \n", + "987 journal - setup Notion dev-journal automation ... 0.878858 0.872043 \n", + "1010 journal - setup Notion dev-journal automation ... 0.965812 0.967793 \n", + "240 Report/GeoCohort - use bbox coords to pull pos... 0.934373 0.954730 \n", + "194 Report/GeoCohort - use bbox coords to pull pos... 0.846638 0.801580 \n", + "129 Map/GeoCohortMap - add setViewportBBox , setAp... 0.721031 0.816096 \n", + "171 Report/GeoCohort - use bbox coords to pull pos... 0.875953 0.853324 \n", + "56 Report/GeoCohort - use bbox coords to pull pos... 0.888429 0.893503 \n", + "69 builder - fix issue on loading beacons for VWI... 0.817489 0.806360 \n", + "286 Report/GeoCohort - use bbox coords to pull pos... 0.976856 0.975869 \n", + "148 Report/GeoCohort - use bbox coords to pull pos... 0.848460 0.807371 \n", + "1033 journal - setup Notion dev-journal automation ... 0.806734 0.823701 \n", + "1137 Report/GeoCohort - use bbox coords to pull pos... 0.857342 0.837314 \n", + "1275 Report/GeoCohort - use bbox coords to pull pos... 0.857011 0.846689 \n", + "1343 Adserver/Beacons - Add support for custom cont... 0.816493 0.775432 \n", + "424 Report/GeoCohort - use bbox coords to pull pos... 0.802533 0.813566 \n", + "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.913966 \n", + "34 package - update package to v0.6.1 0.801621 0.769318 \n", + "221 Map/GeoCohortMap - add setViewportBBox , setAp... 0.885163 0.935586 \n", + "332 Report/GeoCohort - use bbox coords to pull pos... 0.887365 0.880106 \n", + "1104 builder - fix issue on loading beacons for VWI... 0.926480 0.932344 " + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get the pair of pr, did with the highest sim_norm score\n", + "idx = dff.groupby('did').sim_norm.idxmax()\n", + "norm_pairs = df.iloc[idx]\n", + "norm_pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "f46230ce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "did: atom - test Quebec cities with geocoder and api calls and collaborate with Ianec to fix Quebec cities geom pull from DB\n", + "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "sim: 0.8606913685798645\n", + "sim_norm: 0.8536229133605957\n", + "---------- \n", + "\n", + "did: connector-gcs - create connection_hub_gcs_dev test bucket (under EQ Hyperlocal, no org google cloud proj)\n", + "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "sim: 0.8341295719146729\n", + "sim_norm: 0.8524613380432129\n", + "---------- \n", + "\n", + "did: data - review https://github.com/EQWorks/ws-problems/issues/127\n", + "pr: journal - setup Notion dev-journal automation workflow\n", + "sim: 0.8445922136306763\n", + "sim_norm: 0.8637979626655579\n", + "---------- \n", + "\n", + "did: data - review https://github.com/EQWorks/ws-problems/issues/145\n", + "pr: journal - setup Notion dev-journal automation workflow\n", + "sim: 0.8491669297218323\n", + "sim_norm: 0.8658953905105591\n", + "---------- \n", + "\n", + "did: data - review https://github.com/EQWorks/ws-problems/issues/150\n", + "pr: journal - setup Notion dev-journal automation workflow\n", + "sim: 0.8472760915756226\n", + "sim_norm: 0.8654137253761292\n", + "---------- \n", + "\n", + "did: data - review https://github.com/EQWorks/ws-problems/issues/151\n", + "pr: journal - setup Notion dev-journal automation workflow\n", + "sim: 0.8450114727020264\n", + "sim_norm: 0.8636687397956848\n", + "---------- \n", + "\n", + "did: notion/journal - setup db retrieving process\n", + "pr: journal - setup Notion dev-journal automation workflow\n", + "sim: 0.8788578510284424\n", + "sim_norm: 0.8720427751541138\n", + "---------- \n", + "\n", + "did: notion/journal - setup journal routine automation workflows\n", + "pr: journal - setup Notion dev-journal automation workflow\n", + "sim: 0.9658119082450867\n", + "sim_norm: 0.9677931666374207\n", + "---------- \n", + "\n", + "did: overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "sim: 0.9343725442886353\n", + "sim_norm: 0.9547296166419983\n", + "---------- \n", + "\n", + "did: overlord/geocohort - adjust tooltipFormatX and axisBottomLabelDisplayFn for dates in geocohort aggregated data #1834\n", + "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "sim: 0.875952959060669\n", + "sim_norm: 0.8533236384391785\n", + "---------- \n", + "\n", + "did: overlord/geocohort - fix passing the clean list of GeoCohortFSA aggregated data to map #1833\n", + "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "sim: 0.888428807258606\n", + "sim_norm: 0.8935034275054932\n", + "---------- \n", + "\n", + "did: overlord/geocohort - use bbox coords to pull postal code level insights #1833\n", + "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "sim: 0.9768564105033875\n", + "sim_norm: 0.975869357585907\n", + "---------- \n", + "\n", + "did: react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master\n", + "pr: Map/GeoCohortMap - add setViewportBBox , setApiBBox\n", + "sim: 0.8624761700630188\n", + "sim_norm: 0.9139657616615295\n", + "---------- \n", + "\n", + "did: react-maps/GeoCohortMap - open for review [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76\n", + "pr: Map/GeoCohortMap - add setViewportBBox , setApiBBox\n", + "sim: 0.8851630091667175\n", + "sim_norm: 0.9355862736701965\n", + "---------- \n", + "\n", + "did: react-maps/GeoCohortMap/Map - design both maps to send out bbox coords of the current viewport #76\n", + "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "sim: 0.8873649835586548\n", + "sim_norm: 0.88010573387146\n", + "---------- \n", + "\n", + "did: snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)\n", + "pr: builder - fix issue on loading beacons for VWI job creation\n", + "sim: 0.9264801144599915\n", + "sim_norm: 0.9323441386222839\n", + "---------- \n", + "\n" + ] + } + ], + "source": [ + "dedupe = norm_pairs.query('sim_norm >= 0.85')\n", + "printer(dedupe)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "fd79330b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(90, 16)" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(did), len(dedupe)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59ba9378", + "metadata": {}, "outputs": [], "source": [] } From 82c0ba09068ea61f0fb6d80cc63033ec05973c11 Mon Sep 17 00:00:00 2001 From: "Runzhou Li (woozyking)" Date: Tue, 15 Jun 2021 21:15:07 -0400 Subject: [PATCH 3/3] nlp - apply gensim normalization (preprocess) functions --- nlp/Pipfile | 4 + nlp/similarity.ipynb | 1708 +++++++++++++----------------------------- 2 files changed, 529 insertions(+), 1183 deletions(-) diff --git a/nlp/Pipfile b/nlp/Pipfile index 19986f6..096a8b0 100644 --- a/nlp/Pipfile +++ b/nlp/Pipfile @@ -5,9 +5,13 @@ name = "pypi" [packages] fasttext = "*" +gensim = "*" +pyemd = "*" +python-levenshtein = "*" [dev-packages] jupyter = "*" +pandas = "*" [requires] python_version = "3.9" diff --git a/nlp/similarity.ipynb b/nlp/similarity.ipynb index 333be09..78d9ab9 100644 --- a/nlp/similarity.ipynb +++ b/nlp/similarity.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 11, + "execution_count": 86, "id": "956b2884-ac87-4149-b753-aab18a6a51af", "metadata": {}, "outputs": [], @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 87, "id": "e75dbff3", "metadata": {}, "outputs": [], @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 88, "id": "89dd445e-d362-4025-884e-173312194877", "metadata": {}, "outputs": [ @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 89, "id": "c98b13eb-f59e-4949-83de-27809c214d93", "metadata": {}, "outputs": [ @@ -65,32 +65,7 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "7f907a96-60e1-45d2-bbe7-0771740a3769", - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "For now we only support quantization of supervised models", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquantize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# unsupervised word embedding cannot be quantized\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# This means that we have to allow loading of the model on-demand instead bundling like the supervised model for release notes classification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/share/virtualenvs/nlp-qPCzfwe0/lib/python3.9/site-packages/fasttext/FastText.py\u001b[0m in \u001b[0;36mquantize\u001b[0;34m(self, input, qout, cutoff, retrain, epoch, lr, thread, verbose, dsub, qnorm)\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minput\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 357\u001b[0m \u001b[0minput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 358\u001b[0;31m self.f.quantize(\n\u001b[0m\u001b[1;32m 359\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mqout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcutoff\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepoch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdsub\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0mqnorm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: For now we only support quantization of supervised models" - ] - } - ], - "source": [ - "model.quantize() # unsupervised word embedding cannot be quantized\n", - "\n", - "# This means that we have to allow loading of the model on-demand instead bundling like the supervised model for release notes classification" - ] - }, - { - "cell_type": "code", - "execution_count": 17, + "execution_count": 90, "id": "548bd065-d956-4c8e-9b6f-25b5ec63c950", "metadata": {}, "outputs": [], @@ -110,301 +85,160 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "a9399c71-09f3-428a-8694-c9bf58ece0a1", + "execution_count": 123, + "id": "9dd0237e-b982-40cc-a072-81f6cdaca54b", "metadata": {}, "outputs": [], "source": [ - "def sim(model, a, b):\n", - " if type(model) is str:\n", - " model = fasttext.load_model(model)\n", - "\n", - " return cos_sim(*map(model.get_sentence_vector, (a, b)))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "1760f1bd-46d7-47b5-adc8-b7537c851571", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" - ] - }, - { - "data": { - "text/plain": [ - "0.9222608" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim(\n", - " './wiki_1b.bin',\n", - " 'fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", - " 'fix issue on loading beacons for VWI job creation',\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c902ff25-1ea5-4232-b1c6-8a8fa8d9a92f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.9222608" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# prefix stripped\n", - "sim(\n", - " model,\n", - " 'fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", - " 'fix issue on loading beacons for VWI job creation',\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "368f5b45-b38a-44f7-a6c9-58f47d46fe81", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.9264801" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# prefix intact\n", - "sim(\n", - " model,\n", - " 'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", - " 'builder - fix issue on loading beacons for VWI job creation',\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "30ac8064-509d-4115-be3e-02ec809b4d9b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.79175115" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim(\n", - " model,\n", - " 'modal - explore popstate events & add default onbeforeunload alert if there are unsaved changes',\n", - " 'builder - fix issue on loading beacons for VWI job creation',\n", - ")" + "from gensim.utils import simple_preprocess, tokenize\n", + "from gensim.parsing.preprocessing import remove_stopwords, preprocess_string" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "bb87108c-a618-481c-a01f-d43151fb08a1", + "execution_count": 125, + "id": "29fecdc8", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.86247617" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "sim(\n", - " model,\n", - " 'react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master',\n", - " 'Map/GeoCohortMap - add setViewportBBox , setApiBBox'\n", - ")" + "def join_list(l, d = ' '):\n", + " return d.join(l)" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "9dd0237e-b982-40cc-a072-81f6cdaca54b", + "execution_count": 127, + "id": "a484935d", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/Users/woozyking/.local/share/virtualenvs/nlp-qPCzfwe0/lib/python3.9/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", - " warnings.warn(msg)\n" + "builder - fix issue on loading beacons for VWI job creation\n", + "fn: simple_preprocess\n", + "builder fix issue on loading beacons for vwi job creation\n", + "fn: tokenize\n", + "builder fix issue on loading beacons for VWI job creation\n", + "-----\n", + "snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)\n", + "fn: simple_preprocess\n", + "snoke fix beacons list not loading in vwi job creation raised in https app asana com\n", + "fn: tokenize\n", + "snoke fix beacons list not loading in VWI job creation raised in https app asana com\n", + "-----\n" ] } ], "source": [ - "# clear stopwords using gensim preprocessing collection\n", - "from gensim.parsing.preprocessing import STOPWORDS" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "3287c2df", - "metadata": {}, - "outputs": [], - "source": [ - "def normalize(s):\n", - " s = s.lower().split()\n", - " s = [w for w in s if w not in STOPWORDS]\n", - " return ' '.join(s)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "cde36f02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'react-maps - merge [g2m] map/geocohortmap - add setviewportbbox , setapibbox #76 master'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "normalize('react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master')" + "pr = 'builder - fix issue on loading beacons for VWI job creation'\n", + "did = 'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)'\n", + "\n", + "for s in (pr, did):\n", + " print(s)\n", + " for fn in (simple_preprocess, tokenize):\n", + " print('fn:', fn.__name__)\n", + " print(join_list(fn(s)))\n", + " print('-----')" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 128, "id": "e85d1709", "metadata": {}, "outputs": [], "source": [ - "def sim_norm(model, a, b, normalizations = (normalize, model.get_sentence_vector)):\n", + "def sim(model, a, b, normalizations = (simple_preprocess, join_list)):\n", " if type(model) is str:\n", " model = fasttext.load_model(model)\n", "\n", " for fn in normalizations:\n", " a, b = map(fn, (a, b))\n", "\n", - " return cos_sim(a, b)" + " return cos_sim(*map(model.get_sentence_vector, (a, b)))" ] }, { "cell_type": "code", - "execution_count": 39, - "id": "755cfb18", + "execution_count": 129, + "id": "ea4c48d0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.91396576" + "0.91939306" ] }, - "execution_count": 39, + "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sim_norm(\n", + "sim(\n", " model,\n", - " 'react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master',\n", - " 'Map/GeoCohortMap - add setViewportBBox , setApiBBox'\n", + " 'builder - fix issue on loading beacons for VWI job creation',\n", + " 'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", ")" ] }, { "cell_type": "code", - "execution_count": 40, - "id": "616a5181", + "execution_count": 130, + "id": "b8c0cd73", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.79462" + "0.9049676" ] }, - "execution_count": 40, + "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sim_norm(\n", + "sim(\n", " model,\n", - " 'modal - explore popstate events & add default onbeforeunload alert if there are unsaved changes',\n", " 'builder - fix issue on loading beacons for VWI job creation',\n", + " 'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", + " normalizations=[tokenize, join_list],\n", ")" ] }, { "cell_type": "code", - "execution_count": 41, - "id": "2005c41b", + "execution_count": 131, + "id": "59e3596f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.93234414" + "0.9264801" ] }, - "execution_count": 41, + "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sim_norm(\n", + "sim(\n", " model,\n", - " 'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", " 'builder - fix issue on loading beacons for VWI job creation',\n", + " 'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',\n", + " normalizations = [],\n", ")" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 132, "id": "dfd47891", "metadata": {}, "outputs": [], @@ -513,7 +347,17 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 133, + "id": "5add6db0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 134, "id": "0eaed848", "metadata": {}, "outputs": [ @@ -542,6 +386,7 @@ " pr\n", " sim\n", " sim_norm\n", + " sim_tok\n", " \n", " \n", " \n", @@ -550,35 +395,40 @@ " react-maps - merge [G2M] Map/GeoCohortMap - ad...\n", " builder - fix issue on loading beacons for VWI...\n", " 0.726238\n", - " 0.761086\n", + " 0.775629\n", + " 0.764320\n", " \n", " \n", " 1\n", " react-maps - merge [G2M] Map/GeoCohortMap - ad...\n", " Hub - change connection type to direct when do...\n", " 0.753736\n", - " 0.635717\n", + " 0.758098\n", + " 0.787245\n", " \n", " \n", " 2\n", " react-maps - merge [G2M] Map/GeoCohortMap - ad...\n", " QL / New Taxonomies + Missing views + Layer views\n", " 0.481809\n", - " 0.613164\n", + " 0.741818\n", + " 0.597494\n", " \n", " \n", " 3\n", " react-maps - merge [G2M] Map/GeoCohortMap - ad...\n", " Popular Times - add ConscientAI for access\n", " 0.747702\n", - " 0.744181\n", + " 0.728598\n", + " 0.708794\n", " \n", " \n", " 4\n", " react-maps - merge [G2M] Map/GeoCohortMap - ad...\n", " Hub - add direct upload as connection type\n", " 0.756644\n", - " 0.691825\n", + " 0.776477\n", + " 0.786975\n", " \n", " \n", " ...\n", @@ -586,45 +436,51 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 1559\n", " design/locus-ql - built new views popup sequen...\n", " Add map widget\n", " 0.527497\n", - " 0.614477\n", + " 0.659233\n", + " 0.601145\n", " \n", " \n", " 1560\n", " design/locus-ql - built new views popup sequen...\n", " Lumen tree/tailwindcss\n", " 0.601705\n", - " 0.580893\n", + " 0.678209\n", + " 0.662537\n", " \n", " \n", " 1561\n", " design/locus-ql - built new views popup sequen...\n", " Commit watch integration and mention in git co...\n", " 0.661903\n", - " 0.598540\n", + " 0.670235\n", + " 0.677619\n", " \n", " \n", " 1562\n", " design/locus-ql - built new views popup sequen...\n", " journal - setup Notion dev-journal automation ...\n", " 0.749851\n", - " 0.745144\n", + " 0.743611\n", + " 0.749648\n", " \n", " \n", " 1563\n", " design/locus-ql - built new views popup sequen...\n", " losses visualization\n", " 0.523457\n", - " 0.523457\n", + " 0.576668\n", + " 0.576668\n", " \n", " \n", "\n", - "

1564 rows × 4 columns

\n", + "

1564 rows × 5 columns

\n", "" ], "text/plain": [ @@ -641,38 +497,50 @@ "1562 design/locus-ql - built new views popup sequen... \n", "1563 design/locus-ql - built new views popup sequen... \n", "\n", - " pr sim sim_norm \n", - "0 builder - fix issue on loading beacons for VWI... 0.726238 0.761086 \n", - "1 Hub - change connection type to direct when do... 0.753736 0.635717 \n", - "2 QL / New Taxonomies + Missing views + Layer views 0.481809 0.613164 \n", - "3 Popular Times - add ConscientAI for access 0.747702 0.744181 \n", - "4 Hub - add direct upload as connection type 0.756644 0.691825 \n", - "... ... ... ... \n", - "1559 Add map widget 0.527497 0.614477 \n", - "1560 Lumen tree/tailwindcss 0.601705 0.580893 \n", - "1561 Commit watch integration and mention in git co... 0.661903 0.598540 \n", - "1562 journal - setup Notion dev-journal automation ... 0.749851 0.745144 \n", - "1563 losses visualization 0.523457 0.523457 \n", + " pr sim sim_norm \\\n", + "0 builder - fix issue on loading beacons for VWI... 0.726238 0.775629 \n", + "1 Hub - change connection type to direct when do... 0.753736 0.758098 \n", + "2 QL / New Taxonomies + Missing views + Layer views 0.481809 0.741818 \n", + "3 Popular Times - add ConscientAI for access 0.747702 0.728598 \n", + "4 Hub - add direct upload as connection type 0.756644 0.776477 \n", + "... ... ... ... \n", + "1559 Add map widget 0.527497 0.659233 \n", + "1560 Lumen tree/tailwindcss 0.601705 0.678209 \n", + "1561 Commit watch integration and mention in git co... 0.661903 0.670235 \n", + "1562 journal - setup Notion dev-journal automation ... 0.749851 0.743611 \n", + "1563 losses visualization 0.523457 0.576668 \n", "\n", - "[1564 rows x 4 columns]" + " sim_tok \n", + "0 0.764320 \n", + "1 0.787245 \n", + "2 0.597494 \n", + "3 0.708794 \n", + "4 0.786975 \n", + "... ... \n", + "1559 0.601145 \n", + "1560 0.662537 \n", + "1561 0.677619 \n", + "1562 0.749648 \n", + "1563 0.576668 \n", + "\n", + "[1564 rows x 5 columns]" ] }, - "execution_count": 105, + "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import pandas as pd\n", - "\n", "data = []\n", "for did in dids:\n", " for pr in prs:\n", " data.append({\n", " 'did': did,\n", " 'pr': pr,\n", - " 'sim': sim(model, pr, did),\n", - " 'sim_norm': sim_norm(model, pr, did),\n", + " 'sim': sim(model, pr, did, normalizations=[]),\n", + " 'sim_norm': sim(model, pr, did),\n", + " 'sim_tok': sim(model, pr, did, normalizations=[tokenize, join_list]),\n", " })\n", "\n", "df = pd.DataFrame(data)\n", @@ -681,7 +549,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 135, "id": "abc500c8", "metadata": {}, "outputs": [ @@ -710,6 +578,7 @@ " pr\n", " sim\n", " sim_norm\n", + " sim_tok\n", " \n", " \n", " \n", @@ -718,127 +587,104 @@ " react-maps - merge [G2M] Map/GeoCohortMap - ad...\n", " Map/GeoCohortMap - add setViewportBBox , setAp...\n", " 0.862476\n", - " 0.913966\n", + " 0.941750\n", + " 0.902895\n", " \n", " \n", - " 34\n", - " react-maps- upgrade package and make new relea...\n", - " package - update package to v0.6.1\n", - " 0.801621\n", - " 0.769318\n", + " 221\n", + " react-maps/GeoCohortMap - open for review [G2M...\n", + " Map/GeoCohortMap - add setViewportBBox , setAp...\n", + " 0.885163\n", + " 0.956354\n", + " 0.917622\n", " \n", " \n", - " 46\n", - " overlord/geocohort - fix passing the clean lis...\n", - " builder - fix issue on loading beacons for VWI...\n", - " 0.835479\n", - " 0.798545\n", + " 240\n", + " overlord/GeoCohortMap - clean up and open for ...\n", + " Report/GeoCohort - use bbox coords to pull pos...\n", + " 0.934373\n", + " 0.958219\n", + " 0.942460\n", " \n", " \n", - " 47\n", - " overlord/geocohort - fix passing the clean lis...\n", - " Hub - change connection type to direct when do...\n", - " 0.804585\n", - " 0.710856\n", + " 286\n", + " overlord/geocohort - use bbox coords to pull p...\n", + " Report/GeoCohort - use bbox coords to pull pos...\n", + " 0.976856\n", + " 0.981589\n", + " 0.977886\n", " \n", " \n", - " 53\n", - " overlord/geocohort - fix passing the clean lis...\n", - " Camp/Insights/GeoCohort - Add bounded box filt...\n", - " 0.769736\n", - " 0.824506\n", + " 638\n", + " modal/refactor - isolate Modal component\n", + " Modal/refactor\n", + " 0.787151\n", + " 0.923675\n", + " 0.900247\n", " \n", " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", + " 1010\n", + " notion/journal - setup journal routine automat...\n", + " journal - setup Notion dev-journal automation ...\n", + " 0.965812\n", + " 0.960012\n", + " 0.952789\n", " \n", " \n", - " 1518\n", - " locus-ql - wireframe different user flows for ...\n", + " 1104\n", + " snoke - fix beacons list not loading in VWI jo...\n", " builder - fix issue on loading beacons for VWI...\n", - " 0.813455\n", - " 0.779025\n", - " \n", - " \n", - " 1525\n", - " locus-ql - wireframe different user flows for ...\n", - " Camp/Insights/GeoCohort - Add bounded box filt...\n", - " 0.780383\n", - " 0.819209\n", - " \n", - " \n", - " 1528\n", - " locus-ql - wireframe different user flows for ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.835963\n", - " 0.819813\n", - " \n", - " \n", - " 1532\n", - " locus-ql - wireframe different user flows for ...\n", - " Map/GeoCohortMap - add setViewportBBox , setAp...\n", - " 0.725369\n", - " 0.804823\n", - " \n", - " \n", - " 1551\n", - " design/locus-ql - built new views popup sequen...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.812339\n", - " 0.813832\n", + " 0.926480\n", + " 0.919393\n", + " 0.904968\n", " \n", " \n", "\n", - "

148 rows × 4 columns

\n", "" ], "text/plain": [ " did \\\n", "14 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", - "34 react-maps- upgrade package and make new relea... \n", - "46 overlord/geocohort - fix passing the clean lis... \n", - "47 overlord/geocohort - fix passing the clean lis... \n", - "53 overlord/geocohort - fix passing the clean lis... \n", - "... ... \n", - "1518 locus-ql - wireframe different user flows for ... \n", - "1525 locus-ql - wireframe different user flows for ... \n", - "1528 locus-ql - wireframe different user flows for ... \n", - "1532 locus-ql - wireframe different user flows for ... \n", - "1551 design/locus-ql - built new views popup sequen... \n", + "221 react-maps/GeoCohortMap - open for review [G2M... \n", + "240 overlord/GeoCohortMap - clean up and open for ... \n", + "286 overlord/geocohort - use bbox coords to pull p... \n", + "638 modal/refactor - isolate Modal component \n", + "1010 notion/journal - setup journal routine automat... \n", + "1104 snoke - fix beacons list not loading in VWI jo... \n", "\n", - " pr sim sim_norm \n", - "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.913966 \n", - "34 package - update package to v0.6.1 0.801621 0.769318 \n", - "46 builder - fix issue on loading beacons for VWI... 0.835479 0.798545 \n", - "47 Hub - change connection type to direct when do... 0.804585 0.710856 \n", - "53 Camp/Insights/GeoCohort - Add bounded box filt... 0.769736 0.824506 \n", - "... ... ... ... \n", - "1518 builder - fix issue on loading beacons for VWI... 0.813455 0.779025 \n", - "1525 Camp/Insights/GeoCohort - Add bounded box filt... 0.780383 0.819209 \n", - "1528 Report/GeoCohort - use bbox coords to pull pos... 0.835963 0.819813 \n", - "1532 Map/GeoCohortMap - add setViewportBBox , setAp... 0.725369 0.804823 \n", - "1551 Report/GeoCohort - use bbox coords to pull pos... 0.812339 0.813832 \n", + " pr sim sim_norm \\\n", + "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.941750 \n", + "221 Map/GeoCohortMap - add setViewportBBox , setAp... 0.885163 0.956354 \n", + "240 Report/GeoCohort - use bbox coords to pull pos... 0.934373 0.958219 \n", + "286 Report/GeoCohort - use bbox coords to pull pos... 0.976856 0.981589 \n", + "638 Modal/refactor 0.787151 0.923675 \n", + "1010 journal - setup Notion dev-journal automation ... 0.965812 0.960012 \n", + "1104 builder - fix issue on loading beacons for VWI... 0.926480 0.919393 \n", "\n", - "[148 rows x 4 columns]" + " sim_tok \n", + "14 0.902895 \n", + "221 0.917622 \n", + "240 0.942460 \n", + "286 0.977886 \n", + "638 0.900247 \n", + "1010 0.952789 \n", + "1104 0.904968 " ] }, - "execution_count": 106, + "execution_count": 135, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# filter by threshold of sim\n", - "dff = df.query('sim >= 0.8 or sim_norm >= 0.8')\n", + "dff = df.query('sim >= 0.9 or sim_norm >= 0.9 or sim_tok >= 0.9')\n", "dff" ] }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 136, "id": "21ea58b3", "metadata": {}, "outputs": [], @@ -854,7 +700,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 137, "id": "a57f7ed4", "metadata": { "scrolled": false @@ -885,344 +731,203 @@ " pr\n", " sim\n", " sim_norm\n", + " sim_tok\n", " \n", " \n", " \n", " \n", - " 1390\n", - " Airflow - Migrate dev stage workflow to new cl...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.803692\n", - " 0.740882\n", - " \n", - " \n", - " 385\n", - " QL - refactor modal setup\n", + " 638\n", + " modal/refactor - isolate Modal component\n", " Modal/refactor\n", - " 0.701437\n", - " 0.837877\n", + " 0.787151\n", + " 0.923675\n", + " 0.900247\n", " \n", " \n", - " 1413\n", - " Review the data select process design\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.845225\n", - " 0.807889\n", + " 1010\n", + " notion/journal - setup journal routine automat...\n", + " journal - setup Notion dev-journal automation ...\n", + " 0.965812\n", + " 0.960012\n", + " 0.952789\n", " \n", " \n", - " 355\n", - " atom - test Quebec cities with geocoder and ap...\n", + " 240\n", + " overlord/GeoCohortMap - clean up and open for ...\n", " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.860691\n", - " 0.853623\n", + " 0.934373\n", + " 0.958219\n", + " 0.942460\n", " \n", " \n", - " 1367\n", - " connector-gcs - create connection_hub_gcs_dev ...\n", + " 286\n", + " overlord/geocohort - use bbox coords to pull p...\n", " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.834130\n", - " 0.852461\n", - " \n", - " \n", - " 1194\n", - " data - review https://github.com/EQWorks/ws-pr...\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.844592\n", - " 0.863798\n", - " \n", - " \n", - " 1217\n", - " data - review https://github.com/EQWorks/ws-pr...\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.849167\n", - " 0.865895\n", + " 0.976856\n", + " 0.981589\n", + " 0.977886\n", " \n", " \n", - " 1240\n", - " data - review https://github.com/EQWorks/ws-pr...\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.847276\n", - " 0.865414\n", + " 14\n", + " react-maps - merge [G2M] Map/GeoCohortMap - ad...\n", + " Map/GeoCohortMap - add setViewportBBox , setAp...\n", + " 0.862476\n", + " 0.941750\n", + " 0.902895\n", " \n", " \n", - " 1263\n", - " data - review https://github.com/EQWorks/ws-pr...\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.845011\n", - " 0.863669\n", + " 221\n", + " react-maps/GeoCohortMap - open for review [G2M...\n", + " Map/GeoCohortMap - add setViewportBBox , setAp...\n", + " 0.885163\n", + " 0.956354\n", + " 0.917622\n", " \n", " \n", - " 552\n", - " design - add new buttons for rest/query/cancel...\n", + " 1104\n", + " snoke - fix beacons list not loading in VWI jo...\n", " builder - fix issue on loading beacons for VWI...\n", - " 0.845306\n", - " 0.797208\n", - " \n", - " \n", - " 1551\n", - " design/locus-ql - built new views popup sequen...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.812339\n", - " 0.813832\n", - " \n", - " \n", - " 1505\n", - " design/locus-ql - incorporate team's feedback ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.854459\n", - " 0.814493\n", + " 0.926480\n", + " 0.919393\n", + " 0.904968\n", " \n", - " \n", - " 1482\n", - " design/locus-ql - polish the user flow for sel...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.861800\n", - " 0.848261\n", - " \n", - " \n", - " 880\n", - " design/tree-selector - project setup + init ta...\n", - " New Airflow cluster + task priority + sensor t...\n", - " 0.795424\n", - " 0.813049\n", - " \n", - " \n", - " 1436\n", - " locus-ql - make a user flow map for the views ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.846160\n", - " 0.808592\n", - " \n", - " \n", - " 1528\n", - " locus-ql - wireframe different user flows for ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.835963\n", - " 0.819813\n", - " \n", - " \n", - " 401\n", - " modal - explore popstate events & add default ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.850113\n", - " 0.829083\n", - " \n", - " \n", - " 520\n", - " modal/refactor - add reset modal configs\n", - " Map/GeoCohortMap - add setViewportBBox , setAp...\n", - " 0.769230\n", - " 0.812446\n", - " \n", - " \n", - " 539\n", - " modal/refactor - cleanups + remove unused moda...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.811839\n", - " 0.794998\n", + " \n", + "\n", + "" + ], + "text/plain": [ + " did \\\n", + "638 modal/refactor - isolate Modal component \n", + "1010 notion/journal - setup journal routine automat... \n", + "240 overlord/GeoCohortMap - clean up and open for ... \n", + "286 overlord/geocohort - use bbox coords to pull p... \n", + "14 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", + "221 react-maps/GeoCohortMap - open for review [G2M... \n", + "1104 snoke - fix beacons list not loading in VWI jo... \n", + "\n", + " pr sim sim_norm \\\n", + "638 Modal/refactor 0.787151 0.923675 \n", + "1010 journal - setup Notion dev-journal automation ... 0.965812 0.960012 \n", + "240 Report/GeoCohort - use bbox coords to pull pos... 0.934373 0.958219 \n", + "286 Report/GeoCohort - use bbox coords to pull pos... 0.976856 0.981589 \n", + "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.941750 \n", + "221 Map/GeoCohortMap - add setViewportBBox , setAp... 0.885163 0.956354 \n", + "1104 builder - fix issue on loading beacons for VWI... 0.926480 0.919393 \n", + "\n", + " sim_tok \n", + "638 0.900247 \n", + "1010 0.952789 \n", + "240 0.942460 \n", + "286 0.977886 \n", + "14 0.902895 \n", + "221 0.917622 \n", + "1104 0.904968 " + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get the pair of pr, did with the highest sim score\n", + "idx = dff.groupby('did').sim.idxmax()\n", + "sim_pairs = df.iloc[idx]\n", + "sim_pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "id": "4da4fd60", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", " \n", " \n", "
didprsimsim_normsim_tok
638modal/refactor - isolate Modal componentModal/refactor0.7871510.842918
489modal/refactor - isolate execution cancel moda...New Airflow cluster + task priority + sensor t...0.8175020.802322
477modal/refactor - isolate query delete modal co...Modal/refactor0.7710110.832558
699modal/refactor - isolate query save modal configsAdserver/Beacons - Add support for custom cont...0.7720910.805339
446modal/refactor - isolate query save-success mo...Adserver/Beacons - Add support for custom cont...0.7894460.801402
612modal/refactor - re-style Card componentMap/GeoCohortMap - add setViewportBBox , setAp...0.7263900.804693
681modal/refactor - re-style Textfield componentMap/GeoCohortMap - add setViewportBBox , setAp...0.7597230.809706
953notion/journal - add name-transform for displa...Report/GeoCohort - use bbox coords to pull pos...0.8458340.833265
930notion/journal - fix bug with template literalsReport/GeoCohort - use bbox coords to pull pos...0.8100710.781882
987notion/journal - setup db retrieving processjournal - setup Notion dev-journal automation ...0.8788580.8720430.9236750.900247
1010notion/journal - setup journal routine automat...journal - setup Notion dev-journal automation ...0.9658120.9677930.9600120.952789
240overlord/GeoCohortMap - clean up and open for ...Report/GeoCohort - use bbox coords to pull pos...0.9343730.954730
194overlord/geocohort - add all geocohorts to the...Report/GeoCohort - use bbox coords to pull pos...0.8466380.801580
125overlord/geocohort - add extra tab for GeoCoho...Report/GeoCohort - use bbox coords to pull pos...0.8132270.807819
171overlord/geocohort - adjust tooltipFormatX and...Report/GeoCohort - use bbox coords to pull pos...0.8759530.853324
56overlord/geocohort - fix passing the clean lis...Report/GeoCohort - use bbox coords to pull pos...0.8884290.893503
79overlord/geocohort - upgrade @eqworks/react-ma...Report/GeoCohort - use bbox coords to pull pos...0.8307060.7876130.9582190.942460
286overlord/geocohort - use bbox coords to pull p...Report/GeoCohort - use bbox coords to pull pos...0.9768560.975869
148overseer - test last master branch with bbox a...Report/GeoCohort - use bbox coords to pull pos...0.8484600.807371
1012product - review https://github.com/EQWorks/ws...builder - fix issue on loading beacons for VWI...0.8167800.799459
1137python-curriculum/12 - elaborate on the use of...Report/GeoCohort - use bbox coords to pull pos...0.8573420.837314
1275python-curriculum/12 - finalize/grammar cleanu...Report/GeoCohort - use bbox coords to pull pos...0.8570110.846689
1344python-curriculum/12 - widgets (nested to sele...Report/GeoCohort - use bbox coords to pull pos...0.8198850.762207
424ql-connect - ql design prototype overview + ui...Report/GeoCohort - use bbox coords to pull pos...0.8025330.8135660.9815890.977886
14react-maps - merge [G2M] Map/GeoCohortMap - ad...Map/GeoCohortMap - add setViewportBBox , setAp...0.8624760.913966
34react-maps- upgrade package and make new relea...package - update package to v0.6.10.8016210.7693180.9417500.902895
221react-maps/GeoCohortMap - open for review [G2M...Map/GeoCohortMap - add setViewportBBox , setAp...0.8851630.935586
332react-maps/GeoCohortMap/Map - design both maps...Report/GeoCohort - use bbox coords to pull pos...0.8873650.8801060.9563540.917622
1104snoke - fix beacons list not loading in VWI jo...builder - fix issue on loading beacons for VWI...0.9264800.9323440.9193930.904968
\n", @@ -1230,125 +935,50 @@ ], "text/plain": [ " did \\\n", - "1390 Airflow - Migrate dev stage workflow to new cl... \n", - "385 QL - refactor modal setup \n", - "1413 Review the data select process design \n", - "355 atom - test Quebec cities with geocoder and ap... \n", - "1367 connector-gcs - create connection_hub_gcs_dev ... \n", - "1194 data - review https://github.com/EQWorks/ws-pr... \n", - "1217 data - review https://github.com/EQWorks/ws-pr... \n", - "1240 data - review https://github.com/EQWorks/ws-pr... \n", - "1263 data - review https://github.com/EQWorks/ws-pr... \n", - "552 design - add new buttons for rest/query/cancel... \n", - "1551 design/locus-ql - built new views popup sequen... \n", - "1505 design/locus-ql - incorporate team's feedback ... \n", - "1482 design/locus-ql - polish the user flow for sel... \n", - "880 design/tree-selector - project setup + init ta... \n", - "1436 locus-ql - make a user flow map for the views ... \n", - "1528 locus-ql - wireframe different user flows for ... \n", - "401 modal - explore popstate events & add default ... \n", - "520 modal/refactor - add reset modal configs \n", - "539 modal/refactor - cleanups + remove unused moda... \n", "638 modal/refactor - isolate Modal component \n", - "489 modal/refactor - isolate execution cancel moda... \n", - "477 modal/refactor - isolate query delete modal co... \n", - "699 modal/refactor - isolate query save modal configs \n", - "446 modal/refactor - isolate query save-success mo... \n", - "612 modal/refactor - re-style Card component \n", - "681 modal/refactor - re-style Textfield component \n", - "953 notion/journal - add name-transform for displa... \n", - "930 notion/journal - fix bug with template literals \n", - "987 notion/journal - setup db retrieving process \n", "1010 notion/journal - setup journal routine automat... \n", "240 overlord/GeoCohortMap - clean up and open for ... \n", - "194 overlord/geocohort - add all geocohorts to the... \n", - "125 overlord/geocohort - add extra tab for GeoCoho... \n", - "171 overlord/geocohort - adjust tooltipFormatX and... \n", - "56 overlord/geocohort - fix passing the clean lis... \n", - "79 overlord/geocohort - upgrade @eqworks/react-ma... \n", "286 overlord/geocohort - use bbox coords to pull p... \n", - "148 overseer - test last master branch with bbox a... \n", - "1012 product - review https://github.com/EQWorks/ws... \n", - "1137 python-curriculum/12 - elaborate on the use of... \n", - "1275 python-curriculum/12 - finalize/grammar cleanu... \n", - "1344 python-curriculum/12 - widgets (nested to sele... \n", - "424 ql-connect - ql design prototype overview + ui... \n", "14 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", - "34 react-maps- upgrade package and make new relea... \n", "221 react-maps/GeoCohortMap - open for review [G2M... \n", - "332 react-maps/GeoCohortMap/Map - design both maps... \n", "1104 snoke - fix beacons list not loading in VWI jo... \n", "\n", - " pr sim sim_norm \n", - "1390 Report/GeoCohort - use bbox coords to pull pos... 0.803692 0.740882 \n", - "385 Modal/refactor 0.701437 0.837877 \n", - "1413 Report/GeoCohort - use bbox coords to pull pos... 0.845225 0.807889 \n", - "355 Report/GeoCohort - use bbox coords to pull pos... 0.860691 0.853623 \n", - "1367 Report/GeoCohort - use bbox coords to pull pos... 0.834130 0.852461 \n", - "1194 journal - setup Notion dev-journal automation ... 0.844592 0.863798 \n", - "1217 journal - setup Notion dev-journal automation ... 0.849167 0.865895 \n", - "1240 journal - setup Notion dev-journal automation ... 0.847276 0.865414 \n", - "1263 journal - setup Notion dev-journal automation ... 0.845011 0.863669 \n", - "552 builder - fix issue on loading beacons for VWI... 0.845306 0.797208 \n", - "1551 Report/GeoCohort - use bbox coords to pull pos... 0.812339 0.813832 \n", - "1505 Report/GeoCohort - use bbox coords to pull pos... 0.854459 0.814493 \n", - "1482 Report/GeoCohort - use bbox coords to pull pos... 0.861800 0.848261 \n", - "880 New Airflow cluster + task priority + sensor t... 0.795424 0.813049 \n", - "1436 Report/GeoCohort - use bbox coords to pull pos... 0.846160 0.808592 \n", - "1528 Report/GeoCohort - use bbox coords to pull pos... 0.835963 0.819813 \n", - "401 Report/GeoCohort - use bbox coords to pull pos... 0.850113 0.829083 \n", - "520 Map/GeoCohortMap - add setViewportBBox , setAp... 0.769230 0.812446 \n", - "539 Report/GeoCohort - use bbox coords to pull pos... 0.811839 0.794998 \n", - "638 Modal/refactor 0.787151 0.842918 \n", - "489 New Airflow cluster + task priority + sensor t... 0.817502 0.802322 \n", - "477 Modal/refactor 0.771011 0.832558 \n", - "699 Adserver/Beacons - Add support for custom cont... 0.772091 0.805339 \n", - "446 Adserver/Beacons - Add support for custom cont... 0.789446 0.801402 \n", - "612 Map/GeoCohortMap - add setViewportBBox , setAp... 0.726390 0.804693 \n", - "681 Map/GeoCohortMap - add setViewportBBox , setAp... 0.759723 0.809706 \n", - "953 Report/GeoCohort - use bbox coords to pull pos... 0.845834 0.833265 \n", - "930 Report/GeoCohort - use bbox coords to pull pos... 0.810071 0.781882 \n", - "987 journal - setup Notion dev-journal automation ... 0.878858 0.872043 \n", - "1010 journal - setup Notion dev-journal automation ... 0.965812 0.967793 \n", - "240 Report/GeoCohort - use bbox coords to pull pos... 0.934373 0.954730 \n", - "194 Report/GeoCohort - use bbox coords to pull pos... 0.846638 0.801580 \n", - "125 Report/GeoCohort - use bbox coords to pull pos... 0.813227 0.807819 \n", - "171 Report/GeoCohort - use bbox coords to pull pos... 0.875953 0.853324 \n", - "56 Report/GeoCohort - use bbox coords to pull pos... 0.888429 0.893503 \n", - "79 Report/GeoCohort - use bbox coords to pull pos... 0.830706 0.787613 \n", - "286 Report/GeoCohort - use bbox coords to pull pos... 0.976856 0.975869 \n", - "148 Report/GeoCohort - use bbox coords to pull pos... 0.848460 0.807371 \n", - "1012 builder - fix issue on loading beacons for VWI... 0.816780 0.799459 \n", - "1137 Report/GeoCohort - use bbox coords to pull pos... 0.857342 0.837314 \n", - "1275 Report/GeoCohort - use bbox coords to pull pos... 0.857011 0.846689 \n", - "1344 Report/GeoCohort - use bbox coords to pull pos... 0.819885 0.762207 \n", - "424 Report/GeoCohort - use bbox coords to pull pos... 0.802533 0.813566 \n", - "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.913966 \n", - "34 package - update package to v0.6.1 0.801621 0.769318 \n", - "221 Map/GeoCohortMap - add setViewportBBox , setAp... 0.885163 0.935586 \n", - "332 Report/GeoCohort - use bbox coords to pull pos... 0.887365 0.880106 \n", - "1104 builder - fix issue on loading beacons for VWI... 0.926480 0.932344 " + " pr sim sim_norm \\\n", + "638 Modal/refactor 0.787151 0.923675 \n", + "1010 journal - setup Notion dev-journal automation ... 0.965812 0.960012 \n", + "240 Report/GeoCohort - use bbox coords to pull pos... 0.934373 0.958219 \n", + "286 Report/GeoCohort - use bbox coords to pull pos... 0.976856 0.981589 \n", + "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.941750 \n", + "221 Map/GeoCohortMap - add setViewportBBox , setAp... 0.885163 0.956354 \n", + "1104 builder - fix issue on loading beacons for VWI... 0.926480 0.919393 \n", + "\n", + " sim_tok \n", + "638 0.900247 \n", + "1010 0.952789 \n", + "240 0.942460 \n", + "286 0.977886 \n", + "14 0.902895 \n", + "221 0.917622 \n", + "1104 0.904968 " ] }, - "execution_count": 108, + "execution_count": 138, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# get the pair of pr, did with the highest sim score\n", - "idx = dff.groupby('did').sim.idxmax()\n", - "sim_pairs = df.iloc[idx]\n", - "sim_pairs" + "# get the pair of pr, did with the highest sim_norm score\n", + "idx = dff.groupby('did').sim_norm.idxmax()\n", + "norm_pairs = df.iloc[idx]\n", + "norm_pairs" ] }, { "cell_type": "code", - "execution_count": 109, - "id": "4da4fd60", - "metadata": { - "scrolled": false - }, + "execution_count": 139, + "id": "20583fe8", + "metadata": {}, "outputs": [ { "data": { @@ -1375,344 +1005,65 @@ " pr\n", " sim\n", " sim_norm\n", + " sim_tok\n", " \n", " \n", " \n", " \n", - " 1390\n", - " Airflow - Migrate dev stage workflow to new cl...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.803692\n", - " 0.740882\n", - " \n", - " \n", - " 385\n", - " QL - refactor modal setup\n", - " Modal/refactor\n", - " 0.701437\n", - " 0.837877\n", - " \n", - " \n", - " 1424\n", - " Review the data select process design\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.794472\n", - " 0.833235\n", - " \n", - " \n", - " 355\n", - " atom - test Quebec cities with geocoder and ap...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.860691\n", - " 0.853623\n", - " \n", - " \n", - " 1367\n", - " connector-gcs - create connection_hub_gcs_dev ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.834130\n", - " 0.852461\n", - " \n", - " \n", - " 1194\n", - " data - review https://github.com/EQWorks/ws-pr...\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.844592\n", - " 0.863798\n", - " \n", - " \n", - " 1217\n", - " data - review https://github.com/EQWorks/ws-pr...\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.849167\n", - " 0.865895\n", - " \n", - " \n", - " 1240\n", - " data - review https://github.com/EQWorks/ws-pr...\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.847276\n", - " 0.865414\n", - " \n", - " \n", - " 1263\n", - " data - review https://github.com/EQWorks/ws-pr...\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.845011\n", - " 0.863669\n", - " \n", - " \n", - " 566\n", - " design - add new buttons for rest/query/cancel...\n", - " Map/GeoCohortMap - add setViewportBBox , setAp...\n", - " 0.726100\n", - " 0.811494\n", - " \n", - " \n", - " 1551\n", - " design/locus-ql - built new views popup sequen...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.812339\n", - " 0.813832\n", - " \n", - " \n", - " 1505\n", - " design/locus-ql - incorporate team's feedback ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.854459\n", - " 0.814493\n", - " \n", - " \n", - " 1482\n", - " design/locus-ql - polish the user flow for sel...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.861800\n", - " 0.848261\n", - " \n", - " \n", - " 880\n", - " design/tree-selector - project setup + init ta...\n", - " New Airflow cluster + task priority + sensor t...\n", - " 0.795424\n", - " 0.813049\n", - " \n", - " \n", - " 1436\n", - " locus-ql - make a user flow map for the views ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.846160\n", - " 0.808592\n", - " \n", - " \n", - " 1528\n", - " locus-ql - wireframe different user flows for ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.835963\n", - " 0.819813\n", - " \n", - " \n", - " 401\n", - " modal - explore popstate events & add default ...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.850113\n", - " 0.829083\n", - " \n", - " \n", - " 513\n", - " modal/refactor - add reset modal configs\n", - " Camp/Insights/GeoCohort - Add bounded box filt...\n", - " 0.762216\n", - " 0.815759\n", - " \n", - " \n", - " 536\n", - " modal/refactor - cleanups + remove unused moda...\n", - " Camp/Insights/GeoCohort - Add bounded box filt...\n", - " 0.782618\n", - " 0.808684\n", - " \n", - " \n", " 638\n", " modal/refactor - isolate Modal component\n", " Modal/refactor\n", " 0.787151\n", - " 0.842918\n", - " \n", - " \n", - " 489\n", - " modal/refactor - isolate execution cancel moda...\n", - " New Airflow cluster + task priority + sensor t...\n", - " 0.817502\n", - " 0.802322\n", - " \n", - " \n", - " 477\n", - " modal/refactor - isolate query delete modal co...\n", - " Modal/refactor\n", - " 0.771011\n", - " 0.832558\n", - " \n", - " \n", - " 707\n", - " modal/refactor - isolate query save modal configs\n", - " Modal/refactor\n", - " 0.765545\n", - " 0.827165\n", - " \n", - " \n", - " 454\n", - " modal/refactor - isolate query save-success mo...\n", - " Modal/refactor\n", - " 0.760989\n", - " 0.831344\n", - " \n", - " \n", - " 612\n", - " modal/refactor - re-style Card component\n", - " Map/GeoCohortMap - add setViewportBBox , setAp...\n", - " 0.726390\n", - " 0.804693\n", - " \n", - " \n", - " 681\n", - " modal/refactor - re-style Textfield component\n", - " Map/GeoCohortMap - add setViewportBBox , setAp...\n", - " 0.759723\n", - " 0.809706\n", - " \n", - " \n", - " 953\n", - " notion/journal - add name-transform for displa...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.845834\n", - " 0.833265\n", - " \n", - " \n", - " 930\n", - " notion/journal - fix bug with template literals\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.810071\n", - " 0.781882\n", - " \n", - " \n", - " 987\n", - " notion/journal - setup db retrieving process\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.878858\n", - " 0.872043\n", + " 0.923675\n", + " 0.900247\n", " \n", " \n", " 1010\n", " notion/journal - setup journal routine automat...\n", " journal - setup Notion dev-journal automation ...\n", " 0.965812\n", - " 0.967793\n", + " 0.960012\n", + " 0.952789\n", " \n", " \n", " 240\n", " overlord/GeoCohortMap - clean up and open for ...\n", " Report/GeoCohort - use bbox coords to pull pos...\n", " 0.934373\n", - " 0.954730\n", - " \n", - " \n", - " 194\n", - " overlord/geocohort - add all geocohorts to the...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.846638\n", - " 0.801580\n", - " \n", - " \n", - " 129\n", - " overlord/geocohort - add extra tab for GeoCoho...\n", - " Map/GeoCohortMap - add setViewportBBox , setAp...\n", - " 0.721031\n", - " 0.816096\n", - " \n", - " \n", - " 171\n", - " overlord/geocohort - adjust tooltipFormatX and...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.875953\n", - " 0.853324\n", - " \n", - " \n", - " 56\n", - " overlord/geocohort - fix passing the clean lis...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.888429\n", - " 0.893503\n", - " \n", - " \n", - " 69\n", - " overlord/geocohort - upgrade @eqworks/react-ma...\n", - " builder - fix issue on loading beacons for VWI...\n", - " 0.817489\n", - " 0.806360\n", + " 0.958219\n", + " 0.942460\n", " \n", " \n", " 286\n", " overlord/geocohort - use bbox coords to pull p...\n", " Report/GeoCohort - use bbox coords to pull pos...\n", " 0.976856\n", - " 0.975869\n", - " \n", - " \n", - " 148\n", - " overseer - test last master branch with bbox a...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.848460\n", - " 0.807371\n", - " \n", - " \n", - " 1033\n", - " product - review https://github.com/EQWorks/ws...\n", - " journal - setup Notion dev-journal automation ...\n", - " 0.806734\n", - " 0.823701\n", - " \n", - " \n", - " 1137\n", - " python-curriculum/12 - elaborate on the use of...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.857342\n", - " 0.837314\n", - " \n", - " \n", - " 1275\n", - " python-curriculum/12 - finalize/grammar cleanu...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.857011\n", - " 0.846689\n", - " \n", - " \n", - " 1343\n", - " python-curriculum/12 - widgets (nested to sele...\n", - " Adserver/Beacons - Add support for custom cont...\n", - " 0.816493\n", - " 0.775432\n", - " \n", - " \n", - " 424\n", - " ql-connect - ql design prototype overview + ui...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.802533\n", - " 0.813566\n", + " 0.981589\n", + " 0.977886\n", " \n", " \n", " 14\n", " react-maps - merge [G2M] Map/GeoCohortMap - ad...\n", " Map/GeoCohortMap - add setViewportBBox , setAp...\n", " 0.862476\n", - " 0.913966\n", - " \n", - " \n", - " 34\n", - " react-maps- upgrade package and make new relea...\n", - " package - update package to v0.6.1\n", - " 0.801621\n", - " 0.769318\n", + " 0.941750\n", + " 0.902895\n", " \n", " \n", " 221\n", " react-maps/GeoCohortMap - open for review [G2M...\n", " Map/GeoCohortMap - add setViewportBBox , setAp...\n", " 0.885163\n", - " 0.935586\n", - " \n", - " \n", - " 332\n", - " react-maps/GeoCohortMap/Map - design both maps...\n", - " Report/GeoCohort - use bbox coords to pull pos...\n", - " 0.887365\n", - " 0.880106\n", + " 0.956354\n", + " 0.917622\n", " \n", " \n", " 1104\n", " snoke - fix beacons list not loading in VWI jo...\n", " builder - fix issue on loading beacons for VWI...\n", " 0.926480\n", - " 0.932344\n", + " 0.919393\n", + " 0.904968\n", " \n", " \n", "\n", @@ -1720,257 +1071,248 @@ ], "text/plain": [ " did \\\n", - "1390 Airflow - Migrate dev stage workflow to new cl... \n", - "385 QL - refactor modal setup \n", - "1424 Review the data select process design \n", - "355 atom - test Quebec cities with geocoder and ap... \n", - "1367 connector-gcs - create connection_hub_gcs_dev ... \n", - "1194 data - review https://github.com/EQWorks/ws-pr... \n", - "1217 data - review https://github.com/EQWorks/ws-pr... \n", - "1240 data - review https://github.com/EQWorks/ws-pr... \n", - "1263 data - review https://github.com/EQWorks/ws-pr... \n", - "566 design - add new buttons for rest/query/cancel... \n", - "1551 design/locus-ql - built new views popup sequen... \n", - "1505 design/locus-ql - incorporate team's feedback ... \n", - "1482 design/locus-ql - polish the user flow for sel... \n", - "880 design/tree-selector - project setup + init ta... \n", - "1436 locus-ql - make a user flow map for the views ... \n", - "1528 locus-ql - wireframe different user flows for ... \n", - "401 modal - explore popstate events & add default ... \n", - "513 modal/refactor - add reset modal configs \n", - "536 modal/refactor - cleanups + remove unused moda... \n", "638 modal/refactor - isolate Modal component \n", - "489 modal/refactor - isolate execution cancel moda... \n", - "477 modal/refactor - isolate query delete modal co... \n", - "707 modal/refactor - isolate query save modal configs \n", - "454 modal/refactor - isolate query save-success mo... \n", - "612 modal/refactor - re-style Card component \n", - "681 modal/refactor - re-style Textfield component \n", - "953 notion/journal - add name-transform for displa... \n", - "930 notion/journal - fix bug with template literals \n", - "987 notion/journal - setup db retrieving process \n", "1010 notion/journal - setup journal routine automat... \n", "240 overlord/GeoCohortMap - clean up and open for ... \n", - "194 overlord/geocohort - add all geocohorts to the... \n", - "129 overlord/geocohort - add extra tab for GeoCoho... \n", - "171 overlord/geocohort - adjust tooltipFormatX and... \n", - "56 overlord/geocohort - fix passing the clean lis... \n", - "69 overlord/geocohort - upgrade @eqworks/react-ma... \n", "286 overlord/geocohort - use bbox coords to pull p... \n", - "148 overseer - test last master branch with bbox a... \n", - "1033 product - review https://github.com/EQWorks/ws... \n", - "1137 python-curriculum/12 - elaborate on the use of... \n", - "1275 python-curriculum/12 - finalize/grammar cleanu... \n", - "1343 python-curriculum/12 - widgets (nested to sele... \n", - "424 ql-connect - ql design prototype overview + ui... \n", "14 react-maps - merge [G2M] Map/GeoCohortMap - ad... \n", - "34 react-maps- upgrade package and make new relea... \n", "221 react-maps/GeoCohortMap - open for review [G2M... \n", - "332 react-maps/GeoCohortMap/Map - design both maps... \n", "1104 snoke - fix beacons list not loading in VWI jo... \n", "\n", - " pr sim sim_norm \n", - "1390 Report/GeoCohort - use bbox coords to pull pos... 0.803692 0.740882 \n", - "385 Modal/refactor 0.701437 0.837877 \n", - "1424 journal - setup Notion dev-journal automation ... 0.794472 0.833235 \n", - "355 Report/GeoCohort - use bbox coords to pull pos... 0.860691 0.853623 \n", - "1367 Report/GeoCohort - use bbox coords to pull pos... 0.834130 0.852461 \n", - "1194 journal - setup Notion dev-journal automation ... 0.844592 0.863798 \n", - "1217 journal - setup Notion dev-journal automation ... 0.849167 0.865895 \n", - "1240 journal - setup Notion dev-journal automation ... 0.847276 0.865414 \n", - "1263 journal - setup Notion dev-journal automation ... 0.845011 0.863669 \n", - "566 Map/GeoCohortMap - add setViewportBBox , setAp... 0.726100 0.811494 \n", - "1551 Report/GeoCohort - use bbox coords to pull pos... 0.812339 0.813832 \n", - "1505 Report/GeoCohort - use bbox coords to pull pos... 0.854459 0.814493 \n", - "1482 Report/GeoCohort - use bbox coords to pull pos... 0.861800 0.848261 \n", - "880 New Airflow cluster + task priority + sensor t... 0.795424 0.813049 \n", - "1436 Report/GeoCohort - use bbox coords to pull pos... 0.846160 0.808592 \n", - "1528 Report/GeoCohort - use bbox coords to pull pos... 0.835963 0.819813 \n", - "401 Report/GeoCohort - use bbox coords to pull pos... 0.850113 0.829083 \n", - "513 Camp/Insights/GeoCohort - Add bounded box filt... 0.762216 0.815759 \n", - "536 Camp/Insights/GeoCohort - Add bounded box filt... 0.782618 0.808684 \n", - "638 Modal/refactor 0.787151 0.842918 \n", - "489 New Airflow cluster + task priority + sensor t... 0.817502 0.802322 \n", - "477 Modal/refactor 0.771011 0.832558 \n", - "707 Modal/refactor 0.765545 0.827165 \n", - "454 Modal/refactor 0.760989 0.831344 \n", - "612 Map/GeoCohortMap - add setViewportBBox , setAp... 0.726390 0.804693 \n", - "681 Map/GeoCohortMap - add setViewportBBox , setAp... 0.759723 0.809706 \n", - "953 Report/GeoCohort - use bbox coords to pull pos... 0.845834 0.833265 \n", - "930 Report/GeoCohort - use bbox coords to pull pos... 0.810071 0.781882 \n", - "987 journal - setup Notion dev-journal automation ... 0.878858 0.872043 \n", - "1010 journal - setup Notion dev-journal automation ... 0.965812 0.967793 \n", - "240 Report/GeoCohort - use bbox coords to pull pos... 0.934373 0.954730 \n", - "194 Report/GeoCohort - use bbox coords to pull pos... 0.846638 0.801580 \n", - "129 Map/GeoCohortMap - add setViewportBBox , setAp... 0.721031 0.816096 \n", - "171 Report/GeoCohort - use bbox coords to pull pos... 0.875953 0.853324 \n", - "56 Report/GeoCohort - use bbox coords to pull pos... 0.888429 0.893503 \n", - "69 builder - fix issue on loading beacons for VWI... 0.817489 0.806360 \n", - "286 Report/GeoCohort - use bbox coords to pull pos... 0.976856 0.975869 \n", - "148 Report/GeoCohort - use bbox coords to pull pos... 0.848460 0.807371 \n", - "1033 journal - setup Notion dev-journal automation ... 0.806734 0.823701 \n", - "1137 Report/GeoCohort - use bbox coords to pull pos... 0.857342 0.837314 \n", - "1275 Report/GeoCohort - use bbox coords to pull pos... 0.857011 0.846689 \n", - "1343 Adserver/Beacons - Add support for custom cont... 0.816493 0.775432 \n", - "424 Report/GeoCohort - use bbox coords to pull pos... 0.802533 0.813566 \n", - "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.913966 \n", - "34 package - update package to v0.6.1 0.801621 0.769318 \n", - "221 Map/GeoCohortMap - add setViewportBBox , setAp... 0.885163 0.935586 \n", - "332 Report/GeoCohort - use bbox coords to pull pos... 0.887365 0.880106 \n", - "1104 builder - fix issue on loading beacons for VWI... 0.926480 0.932344 " + " pr sim sim_norm \\\n", + "638 Modal/refactor 0.787151 0.923675 \n", + "1010 journal - setup Notion dev-journal automation ... 0.965812 0.960012 \n", + "240 Report/GeoCohort - use bbox coords to pull pos... 0.934373 0.958219 \n", + "286 Report/GeoCohort - use bbox coords to pull pos... 0.976856 0.981589 \n", + "14 Map/GeoCohortMap - add setViewportBBox , setAp... 0.862476 0.941750 \n", + "221 Map/GeoCohortMap - add setViewportBBox , setAp... 0.885163 0.956354 \n", + "1104 builder - fix issue on loading beacons for VWI... 0.926480 0.919393 \n", + "\n", + " sim_tok \n", + "638 0.900247 \n", + "1010 0.952789 \n", + "240 0.942460 \n", + "286 0.977886 \n", + "14 0.902895 \n", + "221 0.917622 \n", + "1104 0.904968 " ] }, - "execution_count": 109, + "execution_count": 139, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# get the pair of pr, did with the highest sim_norm score\n", - "idx = dff.groupby('did').sim_norm.idxmax()\n", - "norm_pairs = df.iloc[idx]\n", - "norm_pairs" + "# get the pair of pr, did with the highest sim_tok score\n", + "idx = dff.groupby('did').sim_tok.idxmax()\n", + "tok_pairs = df.iloc[idx]\n", + "tok_pairs" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 147, "id": "f46230ce", - "metadata": {}, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def check(df, query):\n", + " dedupe = df.query(query)\n", + " print(len(did), len(dedupe), f'{round(len(dedupe) / len(did) * 100, 3)}%')\n", + " printer(dedupe)" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "id": "8ae73229", + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "did: atom - test Quebec cities with geocoder and api calls and collaborate with Ianec to fix Quebec cities geom pull from DB\n", - "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", - "sim: 0.8606913685798645\n", - "sim_norm: 0.8536229133605957\n", - "---------- \n", - "\n", - "did: connector-gcs - create connection_hub_gcs_dev test bucket (under EQ Hyperlocal, no org google cloud proj)\n", - "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", - "sim: 0.8341295719146729\n", - "sim_norm: 0.8524613380432129\n", + "90 7 7.778%\n", + "did: modal/refactor - isolate Modal component\n", + "pr: Modal/refactor\n", + "sim: 0.7871510982513428\n", + "sim_norm: 0.9236753582954407\n", + "sim_tok: 0.9002470374107361\n", "---------- \n", "\n", - "did: data - review https://github.com/EQWorks/ws-problems/issues/127\n", + "did: notion/journal - setup journal routine automation workflows\n", "pr: journal - setup Notion dev-journal automation workflow\n", - "sim: 0.8445922136306763\n", - "sim_norm: 0.8637979626655579\n", + "sim: 0.9658119082450867\n", + "sim_norm: 0.9600117206573486\n", + "sim_tok: 0.9527885317802429\n", "---------- \n", "\n", - "did: data - review https://github.com/EQWorks/ws-problems/issues/145\n", - "pr: journal - setup Notion dev-journal automation workflow\n", - "sim: 0.8491669297218323\n", - "sim_norm: 0.8658953905105591\n", + "did: overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "sim: 0.9343725442886353\n", + "sim_norm: 0.9582189321517944\n", + "sim_tok: 0.9424603581428528\n", "---------- \n", "\n", - "did: data - review https://github.com/EQWorks/ws-problems/issues/150\n", - "pr: journal - setup Notion dev-journal automation workflow\n", - "sim: 0.8472760915756226\n", - "sim_norm: 0.8654137253761292\n", + "did: overlord/geocohort - use bbox coords to pull postal code level insights #1833\n", + "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", + "sim: 0.9768564105033875\n", + "sim_norm: 0.981589138507843\n", + "sim_tok: 0.977886438369751\n", "---------- \n", "\n", - "did: data - review https://github.com/EQWorks/ws-problems/issues/151\n", - "pr: journal - setup Notion dev-journal automation workflow\n", - "sim: 0.8450114727020264\n", - "sim_norm: 0.8636687397956848\n", + "did: react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master\n", + "pr: Map/GeoCohortMap - add setViewportBBox , setApiBBox\n", + "sim: 0.8624761700630188\n", + "sim_norm: 0.9417499899864197\n", + "sim_tok: 0.9028952121734619\n", "---------- \n", "\n", - "did: notion/journal - setup db retrieving process\n", - "pr: journal - setup Notion dev-journal automation workflow\n", - "sim: 0.8788578510284424\n", - "sim_norm: 0.8720427751541138\n", + "did: react-maps/GeoCohortMap - open for review [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76\n", + "pr: Map/GeoCohortMap - add setViewportBBox , setApiBBox\n", + "sim: 0.8851630091667175\n", + "sim_norm: 0.9563536047935486\n", + "sim_tok: 0.9176222085952759\n", "---------- \n", "\n", + "did: snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)\n", + "pr: builder - fix issue on loading beacons for VWI job creation\n", + "sim: 0.9264801144599915\n", + "sim_norm: 0.9193930625915527\n", + "sim_tok: 0.9049676060676575\n", + "---------- \n", + "\n" + ] + } + ], + "source": [ + "check(norm_pairs, 'sim_norm >= 0.9')" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "59ba9378", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "90 4 4.444%\n", "did: notion/journal - setup journal routine automation workflows\n", "pr: journal - setup Notion dev-journal automation workflow\n", "sim: 0.9658119082450867\n", - "sim_norm: 0.9677931666374207\n", + "sim_norm: 0.9600117206573486\n", + "sim_tok: 0.9527885317802429\n", "---------- \n", "\n", "did: overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights\n", "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", "sim: 0.9343725442886353\n", - "sim_norm: 0.9547296166419983\n", + "sim_norm: 0.9582189321517944\n", + "sim_tok: 0.9424603581428528\n", "---------- \n", "\n", - "did: overlord/geocohort - adjust tooltipFormatX and axisBottomLabelDisplayFn for dates in geocohort aggregated data #1834\n", + "did: overlord/geocohort - use bbox coords to pull postal code level insights #1833\n", "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", - "sim: 0.875952959060669\n", - "sim_norm: 0.8533236384391785\n", + "sim: 0.9768564105033875\n", + "sim_norm: 0.981589138507843\n", + "sim_tok: 0.977886438369751\n", "---------- \n", "\n", - "did: overlord/geocohort - fix passing the clean list of GeoCohortFSA aggregated data to map #1833\n", + "did: snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)\n", + "pr: builder - fix issue on loading beacons for VWI job creation\n", + "sim: 0.9264801144599915\n", + "sim_norm: 0.9193930625915527\n", + "sim_tok: 0.9049676060676575\n", + "---------- \n", + "\n" + ] + } + ], + "source": [ + "check(sim_pairs, 'sim >= 0.9')" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "id": "6eec7e1d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "90 7 7.778%\n", + "did: modal/refactor - isolate Modal component\n", + "pr: Modal/refactor\n", + "sim: 0.7871510982513428\n", + "sim_norm: 0.9236753582954407\n", + "sim_tok: 0.9002470374107361\n", + "---------- \n", + "\n", + "did: notion/journal - setup journal routine automation workflows\n", + "pr: journal - setup Notion dev-journal automation workflow\n", + "sim: 0.9658119082450867\n", + "sim_norm: 0.9600117206573486\n", + "sim_tok: 0.9527885317802429\n", + "---------- \n", + "\n", + "did: overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights\n", "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", - "sim: 0.888428807258606\n", - "sim_norm: 0.8935034275054932\n", + "sim: 0.9343725442886353\n", + "sim_norm: 0.9582189321517944\n", + "sim_tok: 0.9424603581428528\n", "---------- \n", "\n", "did: overlord/geocohort - use bbox coords to pull postal code level insights #1833\n", "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", "sim: 0.9768564105033875\n", - "sim_norm: 0.975869357585907\n", + "sim_norm: 0.981589138507843\n", + "sim_tok: 0.977886438369751\n", "---------- \n", "\n", "did: react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master\n", "pr: Map/GeoCohortMap - add setViewportBBox , setApiBBox\n", "sim: 0.8624761700630188\n", - "sim_norm: 0.9139657616615295\n", + "sim_norm: 0.9417499899864197\n", + "sim_tok: 0.9028952121734619\n", "---------- \n", "\n", "did: react-maps/GeoCohortMap - open for review [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76\n", "pr: Map/GeoCohortMap - add setViewportBBox , setApiBBox\n", "sim: 0.8851630091667175\n", - "sim_norm: 0.9355862736701965\n", - "---------- \n", - "\n", - "did: react-maps/GeoCohortMap/Map - design both maps to send out bbox coords of the current viewport #76\n", - "pr: Report/GeoCohort - use bbox coords to pull postal code level insights\n", - "sim: 0.8873649835586548\n", - "sim_norm: 0.88010573387146\n", + "sim_norm: 0.9563536047935486\n", + "sim_tok: 0.9176222085952759\n", "---------- \n", "\n", "did: snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)\n", "pr: builder - fix issue on loading beacons for VWI job creation\n", "sim: 0.9264801144599915\n", - "sim_norm: 0.9323441386222839\n", + "sim_norm: 0.9193930625915527\n", + "sim_tok: 0.9049676060676575\n", "---------- \n", "\n" ] } ], "source": [ - "dedupe = norm_pairs.query('sim_norm >= 0.85')\n", - "printer(dedupe)" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "id": "fd79330b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(90, 16)" - ] - }, - "execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(did), len(dedupe)" + "check(tok_pairs, 'sim_tok >= 0.9')" ] }, { "cell_type": "code", "execution_count": null, - "id": "59ba9378", + "id": "735fa7b6", "metadata": {}, "outputs": [], "source": []