From c0ba14977fe33eef681e21da2b932c4ee1f3a4e0 Mon Sep 17 00:00:00 2001
From: Ramon Melo
Date: Sun, 16 May 2021 19:08:23 -0300
Subject: [PATCH 1/5] Virtual environment setup.

---
 Pipfile      |  15 ++
 Pipfile.lock | 494 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 509 insertions(+)
 create mode 100644 Pipfile
 create mode 100644 Pipfile.lock

diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..73782d2
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,15 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+scrapy = "*"
+scrapyd = "*"
+pymongo = "*"
+attrs = "*"
+
+[dev-packages]
+
+[requires]
+python_version = "3.9"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..03eb33a
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,494 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "e367ec51f608550f043a8291a576a56108f5927961aebe8e394ce6d6608693aa"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.9"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "attrs": {
+            "hashes": [
+                "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
+                "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
+            ],
+            "index": "pypi",
+            "version": "==21.2.0"
+        },
+        "automat": {
+            "hashes": [
+                "sha256:7979803c74610e11ef0c0d68a2942b152df52da55336e0c9d58daf1831cbdf33",
+                "sha256:b6feb6455337df834f6c9962d6ccf771515b7d939bca142b29c20c2376bc6111"
+            ],
+            "version": "==20.2.0"
+        },
+        "cffi": {
+            "hashes": [
+                "sha256:005a36f41773e148deac64b08f233873a4d0c18b053d37da83f6af4d9087b813",
+                "sha256:0857f0ae312d855239a55c81ef453ee8fd24136eaba8e87a2eceba644c0d4c06",
+                "sha256:1071534bbbf8cbb31b498d5d9db0f274f2f7a865adca4ae429e147ba40f73dea",
+                "sha256:158d0d15119b4b7ff6b926536763dc0714313aa59e320ddf787502c70c4d4bee",
+                "sha256:1f436816fc868b098b0d63b8920de7d208c90a67212546d02f84fe78a9c26396",
+
"sha256:2894f2df484ff56d717bead0a5c2abb6b9d2bf26d6960c4604d5c48bbc30ee73", + "sha256:29314480e958fd8aab22e4a58b355b629c59bf5f2ac2492b61e3dc06d8c7a315", + "sha256:34eff4b97f3d982fb93e2831e6750127d1355a923ebaeeb565407b3d2f8d41a1", + "sha256:35f27e6eb43380fa080dccf676dece30bef72e4a67617ffda586641cd4508d49", + "sha256:3d3dd4c9e559eb172ecf00a2a7517e97d1e96de2a5e610bd9b68cea3925b4892", + "sha256:43e0b9d9e2c9e5d152946b9c5fe062c151614b262fda2e7b201204de0b99e482", + "sha256:48e1c69bbacfc3d932221851b39d49e81567a4d4aac3b21258d9c24578280058", + "sha256:51182f8927c5af975fece87b1b369f722c570fe169f9880764b1ee3bca8347b5", + "sha256:58e3f59d583d413809d60779492342801d6e82fefb89c86a38e040c16883be53", + "sha256:5de7970188bb46b7bf9858eb6890aad302577a5f6f75091fd7cdd3ef13ef3045", + "sha256:65fa59693c62cf06e45ddbb822165394a288edce9e276647f0046e1ec26920f3", + "sha256:69e395c24fc60aad6bb4fa7e583698ea6cc684648e1ffb7fe85e3c1ca131a7d5", + "sha256:6c97d7350133666fbb5cf4abdc1178c812cb205dc6f41d174a7b0f18fb93337e", + "sha256:6e4714cc64f474e4d6e37cfff31a814b509a35cb17de4fb1999907575684479c", + "sha256:72d8d3ef52c208ee1c7b2e341f7d71c6fd3157138abf1a95166e6165dd5d4369", + "sha256:8ae6299f6c68de06f136f1f9e69458eae58f1dacf10af5c17353eae03aa0d827", + "sha256:8b198cec6c72df5289c05b05b8b0969819783f9418e0409865dac47288d2a053", + "sha256:99cd03ae7988a93dd00bcd9d0b75e1f6c426063d6f03d2f90b89e29b25b82dfa", + "sha256:9cf8022fb8d07a97c178b02327b284521c7708d7c71a9c9c355c178ac4bbd3d4", + "sha256:9de2e279153a443c656f2defd67769e6d1e4163952b3c622dcea5b08a6405322", + "sha256:9e93e79c2551ff263400e1e4be085a1210e12073a31c2011dbbda14bda0c6132", + "sha256:9ff227395193126d82e60319a673a037d5de84633f11279e336f9c0f189ecc62", + "sha256:a465da611f6fa124963b91bf432d960a555563efe4ed1cc403ba5077b15370aa", + "sha256:ad17025d226ee5beec591b52800c11680fca3df50b8b29fe51d882576e039ee0", + "sha256:afb29c1ba2e5a3736f1c301d9d0abe3ec8b86957d04ddfa9d7a6a42b9367e396", + "sha256:b85eb46a81787c50650f2392b9b4ef23e1f126313b9e0e9013b35c15e4288e2e", 
+ "sha256:bb89f306e5da99f4d922728ddcd6f7fcebb3241fc40edebcb7284d7514741991", + "sha256:cbde590d4faaa07c72bf979734738f328d239913ba3e043b1e98fe9a39f8b2b6", + "sha256:cd2868886d547469123fadc46eac7ea5253ea7fcb139f12e1dfc2bbd406427d1", + "sha256:d42b11d692e11b6634f7613ad8df5d6d5f8875f5d48939520d351007b3c13406", + "sha256:f2d45f97ab6bb54753eab54fffe75aaf3de4ff2341c9daee1987ee1837636f1d", + "sha256:fd78e5fee591709f32ef6edb9a015b4aa1a5022598e36227500c8f4e02328d9c" + ], + "version": "==1.14.5" + }, + "constantly": { + "hashes": [ + "sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35", + "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d" + ], + "version": "==15.1.0" + }, + "cryptography": { + "hashes": [ + "sha256:0f1212a66329c80d68aeeb39b8a16d54ef57071bf22ff4e521657b27372e327d", + "sha256:1e056c28420c072c5e3cb36e2b23ee55e260cb04eee08f702e0edfec3fb51959", + "sha256:240f5c21aef0b73f40bb9f78d2caff73186700bf1bc6b94285699aff98cc16c6", + "sha256:26965837447f9c82f1855e0bc8bc4fb910240b6e0d16a664bb722df3b5b06873", + "sha256:37340614f8a5d2fb9aeea67fd159bfe4f5f4ed535b1090ce8ec428b2f15a11f2", + "sha256:3d10de8116d25649631977cb37da6cbdd2d6fa0e0281d014a5b7d337255ca713", + "sha256:3d8427734c781ea5f1b41d6589c293089704d4759e34597dce91014ac125aad1", + "sha256:7ec5d3b029f5fa2b179325908b9cd93db28ab7b85bb6c1db56b10e0b54235177", + "sha256:8e56e16617872b0957d1c9742a3f94b43533447fd78321514abbe7db216aa250", + "sha256:de4e5f7f68220d92b7637fc99847475b59154b7a1b3868fb7385337af54ac9ca", + "sha256:eb8cc2afe8b05acbd84a43905832ec78e7b3873fb124ca190f574dca7389a87d", + "sha256:ee77aa129f481be46f8d92a1a7db57269a2f23052d5f2433b4621bb457081cc9" + ], + "markers": "python_version >= '3.6'", + "version": "==3.4.7" + }, + "cssselect": { + "hashes": [ + "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf", + "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc" + ], + "markers": "python_version >= '2.7' and python_version not in 
'3.0, 3.1, 3.2, 3.3'", + "version": "==1.1.0" + }, + "h2": { + "hashes": [ + "sha256:61e0f6601fa709f35cdb730863b4e5ec7ad449792add80d1410d4174ed139af5", + "sha256:875f41ebd6f2c44781259005b157faed1a5031df3ae5aa7bcb4628a6c0782f14" + ], + "version": "==3.2.0" + }, + "hpack": { + "hashes": [ + "sha256:0edd79eda27a53ba5be2dfabf3b15780928a0dff6eb0c60a3d6767720e970c89", + "sha256:8eec9c1f4bfae3408a3f30500261f7e6a65912dc138526ea054f9ad98892e9d2" + ], + "version": "==3.0.0" + }, + "hyperframe": { + "hashes": [ + "sha256:5187962cb16dcc078f23cb5a4b110098d546c3f41ff2d4038a9896893bbd0b40", + "sha256:a9f5c17f2cc3c719b917c4f33ed1c61bd1f8dfac4b1bd23b7c80b3400971b41f" + ], + "version": "==5.2.0" + }, + "hyperlink": { + "hashes": [ + "sha256:427af957daa58bc909471c6c40f74c5450fa123dd093fc53efd2e91d2705a56b", + "sha256:e6b14c37ecb73e89c77d78cdb4c2cc8f3fb59a885c5b3f819ff4ed80f25af1b4" + ], + "version": "==21.0.0" + }, + "idna": { + "hashes": [ + "sha256:5205d03e7bcbb919cc9c19885f9920d622ca52448306f2377daede5cf3faac16", + "sha256:c5b02147e01ea9920e6b0a3f1f7bb833612d507592c837a6c49552768f4054e1" + ], + "markers": "python_version >= '3.4'", + "version": "==3.1" + }, + "incremental": { + "hashes": [ + "sha256:02f5de5aff48f6b9f665d99d48bfc7ec03b6e3943210de7cfc88856d755d6f57", + "sha256:92014aebc6a20b78a8084cdd5645eeaa7f74b8933f70fa3ada2cfbd1e3b54321" + ], + "version": "==21.3.0" + }, + "itemadapter": { + "hashes": [ + "sha256:5327c2136353cb965b6b4ba564af002fd458691b8e30d3bd6b14c474d92c6b25", + "sha256:cb7aaa577fefe2aa6f229ccf4d058e05f44e0178a98c8fb70ee4d95acfabb423" + ], + "markers": "python_version >= '3.6'", + "version": "==0.2.0" + }, + "itemloaders": { + "hashes": [ + "sha256:1277cd8ca3e4c02dcdfbc1bcae9134ad89acfa6041bd15b4561c6290203a0c96", + "sha256:4cb46a0f8915e910c770242ae3b60b1149913ed37162804f1e40e8535d6ec497" + ], + "markers": "python_version >= '3.6'", + "version": "==1.0.4" + }, + "jmespath": { + "hashes": [ + 
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", + "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.10.0" + }, + "lxml": { + "hashes": [ + "sha256:079f3ae844f38982d156efce585bc540c16a926d4436712cf4baee0cce487a3d", + "sha256:0fbcf5565ac01dff87cbfc0ff323515c823081c5777a9fc7703ff58388c258c3", + "sha256:122fba10466c7bd4178b07dba427aa516286b846b2cbd6f6169141917283aae2", + "sha256:1b38116b6e628118dea5b2186ee6820ab138dbb1e24a13e478490c7db2f326ae", + "sha256:1b7584d421d254ab86d4f0b13ec662a9014397678a7c4265a02a6d7c2b18a75f", + "sha256:26e761ab5b07adf5f555ee82fb4bfc35bf93750499c6c7614bd64d12aaa67927", + "sha256:289e9ca1a9287f08daaf796d96e06cb2bc2958891d7911ac7cae1c5f9e1e0ee3", + "sha256:2a9d50e69aac3ebee695424f7dbd7b8c6d6eb7de2a2eb6b0f6c7db6aa41e02b7", + "sha256:3082c518be8e97324390614dacd041bb1358c882d77108ca1957ba47738d9d59", + "sha256:33bb934a044cf32157c12bfcfbb6649807da20aa92c062ef51903415c704704f", + "sha256:3439c71103ef0e904ea0a1901611863e51f50b5cd5e8654a151740fde5e1cade", + "sha256:36108c73739985979bf302006527cf8a20515ce444ba916281d1c43938b8bb96", + "sha256:39b78571b3b30645ac77b95f7c69d1bffc4cf8c3b157c435a34da72e78c82468", + "sha256:4289728b5e2000a4ad4ab8da6e1db2e093c63c08bdc0414799ee776a3f78da4b", + "sha256:4bff24dfeea62f2e56f5bab929b4428ae6caba2d1eea0c2d6eb618e30a71e6d4", + "sha256:4c61b3a0db43a1607d6264166b230438f85bfed02e8cff20c22e564d0faff354", + "sha256:542d454665a3e277f76954418124d67516c5f88e51a900365ed54a9806122b83", + "sha256:5a0a14e264069c03e46f926be0d8919f4105c1623d620e7ec0e612a2e9bf1c04", + "sha256:5c8c163396cc0df3fd151b927e74f6e4acd67160d6c33304e805b84293351d16", + "sha256:66e575c62792c3f9ca47cb8b6fab9e35bab91360c783d1606f758761810c9791", + "sha256:6f12e1427285008fd32a6025e38e977d44d6382cf28e7201ed10d6c1698d2a9a", + "sha256:74f7d8d439b18fa4c385f3f5dfd11144bb87c1da034a466c5b5577d23a1d9b51", + 
"sha256:7610b8c31688f0b1be0ef882889817939490a36d0ee880ea562a4e1399c447a1", + "sha256:76fa7b1362d19f8fbd3e75fe2fb7c79359b0af8747e6f7141c338f0bee2f871a", + "sha256:7728e05c35412ba36d3e9795ae8995e3c86958179c9770e65558ec3fdfd3724f", + "sha256:8157dadbb09a34a6bd95a50690595e1fa0af1a99445e2744110e3dca7831c4ee", + "sha256:820628b7b3135403540202e60551e741f9b6d3304371712521be939470b454ec", + "sha256:884ab9b29feaca361f7f88d811b1eea9bfca36cf3da27768d28ad45c3ee6f969", + "sha256:89b8b22a5ff72d89d48d0e62abb14340d9e99fd637d046c27b8b257a01ffbe28", + "sha256:92e821e43ad382332eade6812e298dc9701c75fe289f2a2d39c7960b43d1e92a", + "sha256:b007cbb845b28db4fb8b6a5cdcbf65bacb16a8bd328b53cbc0698688a68e1caa", + "sha256:bc4313cbeb0e7a416a488d72f9680fffffc645f8a838bd2193809881c67dd106", + "sha256:bccbfc27563652de7dc9bdc595cb25e90b59c5f8e23e806ed0fd623755b6565d", + "sha256:c47ff7e0a36d4efac9fd692cfa33fbd0636674c102e9e8d9b26e1b93a94e7617", + "sha256:c4f05c5a7c49d2fb70223d0d5bcfbe474cf928310ac9fa6a7c6dddc831d0b1d4", + "sha256:cdaf11d2bd275bf391b5308f86731e5194a21af45fbaaaf1d9e8147b9160ea92", + "sha256:ce256aaa50f6cc9a649c51be3cd4ff142d67295bfc4f490c9134d0f9f6d58ef0", + "sha256:d2e35d7bf1c1ac8c538f88d26b396e73dd81440d59c1ef8522e1ea77b345ede4", + "sha256:d916d31fd85b2f78c76400d625076d9124de3e4bda8b016d25a050cc7d603f24", + "sha256:df7c53783a46febb0e70f6b05df2ba104610f2fb0d27023409734a3ecbb78fb2", + "sha256:e1cbd3f19a61e27e011e02f9600837b921ac661f0c40560eefb366e4e4fb275e", + "sha256:efac139c3f0bf4f0939f9375af4b02c5ad83a622de52d6dfa8e438e8e01d0eb0", + "sha256:efd7a09678fd8b53117f6bae4fa3825e0a22b03ef0a932e070c0bdbb3a35e654", + "sha256:f2380a6376dfa090227b663f9678150ef27543483055cc327555fb592c5967e2", + "sha256:f8380c03e45cf09f8557bdaa41e1fa7c81f3ae22828e1db470ab2a6c96d8bc23", + "sha256:f90ba11136bfdd25cae3951af8da2e95121c9b9b93727b1b896e3fa105b2f586" + ], + "markers": "platform_python_implementation == 'CPython'", + "version": "==4.6.3" + }, + "parsel": { + "hashes": [ + 
"sha256:70efef0b651a996cceebc69e55a85eb2233be0890959203ba7c3a03c72725c79", + "sha256:9e1fa8db1c0b4a878bf34b35c043d89c9d1cbebc23b4d34dbc3c0ec33f2e087d" + ], + "version": "==1.6.0" + }, + "priority": { + "hashes": [ + "sha256:6bc1961a6d7fcacbfc337769f1a382c8e746566aaa365e78047abe9f66b2ffbe", + "sha256:be4fcb94b5e37cdeb40af5533afe6dd603bd665fe9c8b3052610fc1001d5d1eb" + ], + "version": "==1.3.0" + }, + "protego": { + "hashes": [ + "sha256:a682771bc7b51b2ff41466460896c1a5a653f9a1e71639ef365a72e66d8734b4" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==0.1.16" + }, + "pyasn1": { + "hashes": [ + "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359", + "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576", + "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf", + "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7", + "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d", + "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00", + "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8", + "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86", + "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12", + "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776", + "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba", + "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2", + "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3" + ], + "version": "==0.4.8" + }, + "pyasn1-modules": { + "hashes": [ + "sha256:0845a5582f6a02bb3e1bde9ecfc4bfcae6ec3210dd270522fee602365430c3f8", + "sha256:0fe1b68d1e486a1ed5473f1302bd991c1611d319bba158e98b106ff86e1d7199", + "sha256:15b7c67fabc7fc240d87fb9aabf999cf82311a6d6fb2c70d00d3d0604878c811", + 
"sha256:426edb7a5e8879f1ec54a1864f16b882c2837bfd06eee62f2c982315ee2473ed", + "sha256:65cebbaffc913f4fe9e4808735c95ea22d7a7775646ab690518c056784bc21b4", + "sha256:905f84c712230b2c592c19470d3ca8d552de726050d1d1716282a1f6146be65e", + "sha256:a50b808ffeb97cb3601dd25981f6b016cbb3d31fbf57a8b8a87428e6158d0c74", + "sha256:a99324196732f53093a84c4369c996713eb8c89d360a496b599fb1a9c47fc3eb", + "sha256:b80486a6c77252ea3a3e9b1e360bc9cf28eaac41263d173c032581ad2f20fe45", + "sha256:c29a5e5cc7a3f05926aff34e097e84f8589cd790ce0ed41b67aed6857b26aafd", + "sha256:cbac4bc38d117f2a49aeedec4407d23e8866ea4ac27ff2cf7fb3e5b570df19e0", + "sha256:f39edd8c4ecaa4556e989147ebf219227e2cd2e8a43c7e7fcb1f1c18c5fd6a3d", + "sha256:fe0644d9ab041506b62782e92b06b8c68cca799e1a9636ec398675459e031405" + ], + "version": "==0.2.8" + }, + "pycparser": { + "hashes": [ + "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", + "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.20" + }, + "pydispatcher": { + "hashes": [ + "sha256:5570069e1b1769af1fe481de6dd1d3a388492acddd2cdad7a3bde145615d5caf", + "sha256:5be4a8be12805ef7d712dd9a93284fb8bc53f309867e573f653a72e5fd10e433" + ], + "markers": "platform_python_implementation == 'CPython'", + "version": "==2.0.5" + }, + "pymongo": { + "hashes": [ + "sha256:03be7ad107d252bb7325d4af6309fdd2c025d08854d35f0e7abc8bf048f4245e", + "sha256:071552b065e809d24c5653fcc14968cfd6fde4e279408640d5ac58e3353a3c5f", + "sha256:08b8723248730599c9803ae4c97b8f3f76c55219104303c88cb962a31e3bb5ee", + "sha256:08bda7b2c522ff9f1e554570da16298271ebb0c56ab9699446aacba249008988", + "sha256:0aaf4d44f1f819360f9432df538d54bbf850f18152f34e20337c01b828479171", + "sha256:0cabfc297f4cf921f15bc789a8fbfd7115eb9f813d3f47a74b609894bc66ab0d", + "sha256:13acf6164ead81c9fc2afa0e1ea6d6134352973ce2bb35496834fee057063c04", + 
"sha256:15b083d1b789b230e5ac284442d9ecb113c93f3785a6824f748befaab803b812", + "sha256:161fcd3281c42f644aa8dec7753cca2af03ce654e17d76da4f0dab34a12480ca", + "sha256:1a994a42f49dab5b6287e499be7d3d2751776486229980d8857ad53b8333d469", + "sha256:20d75ea11527331a2980ab04762a9d960bcfea9475c54bbeab777af880de61cd", + "sha256:225c61e08fe517aede7912937939e09adf086c8e6f7e40d4c85ad678c2c2aea3", + "sha256:3135dd574ef1286189f3f04a36c8b7a256376914f8cbbce66b94f13125ded858", + "sha256:3491c7de09e44eded16824cb58cf9b5cc1dc6f066a0bb7aa69929d02aa53b828", + "sha256:3551912f5c34d8dd7c32c6bb00ae04192af47f7b9f653608f107d19c1a21a194", + "sha256:38a7b5140a48fc91681cdb5cb95b7cd64640b43d19259fdd707fa9d5a715f2b2", + "sha256:3a3498a8326111221560e930f198b495ea6926937e249f475052ffc6893a6680", + "sha256:3bfc7689a1bacb9bcd2f2d5185d99507aa29f667a58dd8adaa43b5a348139e46", + "sha256:421d13523d11c57f57f257152bc4a6bb463aadf7a3918e9c96fefdd6be8dbfb8", + "sha256:424799c71ff435094e5fb823c40eebb4500f0e048133311e9c026467e8ccebac", + "sha256:474e21d0e07cd09679e357d1dac76e570dab86665e79a9d3354b10a279ac6fb3", + "sha256:4c7e8c8e1e1918dcf6a652ac4b9d87164587c26fd2ce5dd81e73a5ab3b3d492f", + "sha256:506a6dab4c7ffdcacdf0b8e70bd20eb2e77fa994519547c9d88d676400fcad58", + "sha256:510cd3bfabb63a07405b7b79fae63127e34c118b7531a2cbbafc7a24fd878594", + "sha256:517ba47ca04a55b1f50ee8df9fd97f6c37df5537d118fb2718952b8623860466", + "sha256:539d4cb1b16b57026999c53e5aab857fe706e70ae5310cc8c232479923f932e6", + "sha256:5c36428cc4f7fae56354db7f46677fd21222fc3cb1e8829549b851172033e043", + "sha256:5db59223ed1e634d842a053325f85f908359c6dac9c8ddce8ef145061fae7df8", + "sha256:5e606846c049ed40940524057bfdf1105af6066688c0e6a1a3ce2038589bae70", + "sha256:6060794aac9f7b0644b299f46a9c6cbc0bc470bd01572f4134df140afd41ded6", + "sha256:62c29bc36a6d9be68fe7b5aaf1e120b4aa66a958d1e146601fcd583eb12cae7b", + "sha256:73326b211e7410c8bd6a74500b1e3f392f39cf10862e243d00937e924f112c01", + "sha256:78f07961f4f214ea8e80be63cffd5cc158eb06cd922ffbf6c7155b11728f28f9", 
+ "sha256:7c97554ea521f898753d9773891d0347ebfaddcc1dee2ad94850b163171bf1f1", + "sha256:8898f6699f740ca93a0879ed07d8e6db02d68af889d0ebb3d13ab017e6b1af1e", + "sha256:8a41fdc751dc4707a4fafb111c442411816a7c225ebb5cadb57599534b5d5372", + "sha256:8e0004b0393d72d76de94b4792a006cb960c1c65c7659930fbf9a81ce4341982", + "sha256:977b1d4f868986b4ba5d03c317fde4d3b66e687d74473130cd598e3103db34fa", + "sha256:9a4f6e0b01df820ba9ed0b4e618ca83a1c089e48d4f268d0e00dcd49893d4549", + "sha256:9b9298964389c180a063a9e8bac8a80ed42de11d04166b20249bfa0a489e0e0f", + "sha256:a08c8b322b671857c81f4c30cd3c8df2895fd3c0e9358714f39e0ef8fb327702", + "sha256:ad31f184dcd3271de26ab1f9c51574afb99e1b0e484ab1da3641256b723e4994", + "sha256:aff3656af2add93f290731a6b8930b23b35c0c09569150130a58192b3ec6fc61", + "sha256:b2f41261b648cf5dee425f37ff14f4ad151c2f24b827052b402637158fd056ef", + "sha256:b413117210fa6d92664c3d860571e8e8727c3e8f2ff197276c5d0cb365abd3ad", + "sha256:b7efc7e7049ef366777cfd35437c18a4166bb50a5606a1c840ee3b9624b54fc9", + "sha256:b8f94acd52e530a38f25e4d5bf7ddfdd4bea9193e718f58419def0d4406b58d3", + "sha256:d0a70151d7de8a3194cdc906bcc1a42e14594787c64b0c1c9c975e5a2af3e251", + "sha256:d360e5d5dd3d55bf5d1776964625018d85b937d1032bae1926dd52253decd0db", + "sha256:d4e62417e89b717a7bcd8576ac3108cd063225942cc91c5b37ff5465fdccd386", + "sha256:d65bac5f6724d9ea6f0b5a0f0e4952fbbf209adcf6b5583b54c54bd2fcd74dc0", + "sha256:e02beaab433fd1104b2804f909e694cfbdb6578020740a9051597adc1cd4e19f", + "sha256:e4b631688dfbdd61b5610e20b64b99d25771c6d52d9da73349342d2a0f11c46a", + "sha256:e4e9db78b71db2b1684ee4ecc3e32c4600f18cdf76e6b9ae03e338e52ee4b168", + "sha256:eb4d176394c37a76e8b0afe54b12d58614a67a60a7f8c0dd3a5afbb013c01092", + "sha256:f08665d3cc5abc2f770f472a9b5f720a9b3ab0b8b3bb97c7c1487515e5653d39", + "sha256:f3d851af3852f16ad4adc7ee054fd9c90a7a5063de94d815b7f6a88477b9f4c6", + "sha256:f4ba58157e8ae33ee86fadf9062c506e535afd904f07f9be32731f4410a23b7f", + 
"sha256:f664ed7613b8b18f0ce5696b146776266a038c19c5cd6efffa08ecc189b01b73", + "sha256:f947b359cc4769af8b49be7e37af01f05fcf15b401da2528021148e4a54426d1", + "sha256:fe4189846448df013cd9df11bba38ddf78043f8c290a9f06430732a7a8601cce", + "sha256:fea5cb1c63efe1399f0812532c7cf65458d38fd011be350bc5021dfcac39fba8", + "sha256:fedf0dee7a412ca6d1d6d92c158fe9cbaa8ea0cae90d268f9ccc0744de7a97d0", + "sha256:fffff7bfb6799a763d3742c59c6ee7ffadda21abed557637bc44ed1080876484" + ], + "index": "pypi", + "version": "==3.11.4" + }, + "pyopenssl": { + "hashes": [ + "sha256:4c231c759543ba02560fcd2480c48dcec4dae34c9da7d3747c508227e0624b51", + "sha256:818ae18e06922c066f777a33f1fca45786d85edfe71cd043de6379337a7f274b" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==20.0.1" + }, + "queuelib": { + "hashes": [ + "sha256:631d067c9be57e395c382d680d3653ca1452cd29e8da25c5e8d94b5c0c528c31", + "sha256:90ee30ebb0b57112606358b63c09a681bbb9a7dd1120af09c836b475504cea85" + ], + "version": "==1.6.1" + }, + "scrapy": { + "hashes": [ + "sha256:0a68ed41f7173679f160c4cef2db05288548c21e7164170552adae8b13cefaab", + "sha256:5f590fdc84b496e5a4bb5ef99836b0aa688a07cfcb4bc3bb7290f66486f27424" + ], + "index": "pypi", + "version": "==2.5.0" + }, + "scrapyd": { + "hashes": [ + "sha256:7887b2b6d1d84291528cb65b6a2cce95d630e027c32330e72a5aa16710a6c3cb", + "sha256:9afff46957794618daea93046bca58576d06c1a32b290134c051235971c32a9c" + ], + "index": "pypi", + "version": "==1.2.1" + }, + "service-identity": { + "hashes": [ + "sha256:6e6c6086ca271dc11b033d17c3a8bea9f24ebff920c587da090afc9519419d34", + "sha256:f0b0caac3d40627c3c04d7a51b6e06721857a0e10a8775f2d1d7e72901b3a7db" + ], + "version": "==21.1.0" + }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 
+ "version": "==1.16.0" + }, + "twisted": { + "extras": [ + "http2" + ], + "hashes": [ + "sha256:77544a8945cf69b98d2946689bbe0c75de7d145cdf11f391dd487eae8fc95a12", + "sha256:aab38085ea6cda5b378b519a0ec99986874921ee8881318626b0a3414bb2631e" + ], + "markers": "python_full_version >= '3.5.4'", + "version": "==21.2.0" + }, + "w3lib": { + "hashes": [ + "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53", + "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df" + ], + "version": "==1.22.0" + }, + "zope.interface": { + "hashes": [ + "sha256:08f9636e99a9d5410181ba0729e0408d3d8748026ea938f3b970a0249daa8192", + "sha256:0b465ae0962d49c68aa9733ba92a001b2a0933c317780435f00be7ecb959c702", + "sha256:0cba8477e300d64a11a9789ed40ee8932b59f9ee05f85276dbb4b59acee5dd09", + "sha256:0cee5187b60ed26d56eb2960136288ce91bcf61e2a9405660d271d1f122a69a4", + "sha256:0ea1d73b7c9dcbc5080bb8aaffb776f1c68e807767069b9ccdd06f27a161914a", + "sha256:0f91b5b948686659a8e28b728ff5e74b1be6bf40cb04704453617e5f1e945ef3", + "sha256:15e7d1f7a6ee16572e21e3576d2012b2778cbacf75eb4b7400be37455f5ca8bf", + "sha256:17776ecd3a1fdd2b2cd5373e5ef8b307162f581c693575ec62e7c5399d80794c", + "sha256:194d0bcb1374ac3e1e023961610dc8f2c78a0f5f634d0c737691e215569e640d", + "sha256:1c0e316c9add0db48a5b703833881351444398b04111188069a26a61cfb4df78", + "sha256:205e40ccde0f37496904572035deea747390a8b7dc65146d30b96e2dd1359a83", + "sha256:273f158fabc5ea33cbc936da0ab3d4ba80ede5351babc4f577d768e057651531", + "sha256:2876246527c91e101184f63ccd1d716ec9c46519cc5f3d5375a3351c46467c46", + "sha256:2c98384b254b37ce50eddd55db8d381a5c53b4c10ee66e1e7fe749824f894021", + "sha256:2e5a26f16503be6c826abca904e45f1a44ff275fdb7e9d1b75c10671c26f8b94", + "sha256:334701327f37c47fa628fc8b8d28c7d7730ce7daaf4bda1efb741679c2b087fc", + "sha256:3748fac0d0f6a304e674955ab1365d515993b3a0a865e16a11ec9d86fb307f63", + "sha256:3c02411a3b62668200910090a0dff17c0b25aaa36145082a5a6adf08fa281e54", + 
"sha256:3dd4952748521205697bc2802e4afac5ed4b02909bb799ba1fe239f77fd4e117", + "sha256:3f24df7124c323fceb53ff6168da70dbfbae1442b4f3da439cd441681f54fe25", + "sha256:469e2407e0fe9880ac690a3666f03eb4c3c444411a5a5fddfdabc5d184a79f05", + "sha256:4de4bc9b6d35c5af65b454d3e9bc98c50eb3960d5a3762c9438df57427134b8e", + "sha256:5208ebd5152e040640518a77827bdfcc73773a15a33d6644015b763b9c9febc1", + "sha256:52de7fc6c21b419078008f697fd4103dbc763288b1406b4562554bd47514c004", + "sha256:5bb3489b4558e49ad2c5118137cfeaf59434f9737fa9c5deefc72d22c23822e2", + "sha256:5dba5f530fec3f0988d83b78cc591b58c0b6eb8431a85edd1569a0539a8a5a0e", + "sha256:5dd9ca406499444f4c8299f803d4a14edf7890ecc595c8b1c7115c2342cadc5f", + "sha256:5f931a1c21dfa7a9c573ec1f50a31135ccce84e32507c54e1ea404894c5eb96f", + "sha256:63b82bb63de7c821428d513607e84c6d97d58afd1fe2eb645030bdc185440120", + "sha256:66c0061c91b3b9cf542131148ef7ecbecb2690d48d1612ec386de9d36766058f", + "sha256:6f0c02cbb9691b7c91d5009108f975f8ffeab5dff8f26d62e21c493060eff2a1", + "sha256:71aace0c42d53abe6fc7f726c5d3b60d90f3c5c055a447950ad6ea9cec2e37d9", + "sha256:7d97a4306898b05404a0dcdc32d9709b7d8832c0c542b861d9a826301719794e", + "sha256:7df1e1c05304f26faa49fa752a8c690126cf98b40b91d54e6e9cc3b7d6ffe8b7", + "sha256:8270252effc60b9642b423189a2fe90eb6b59e87cbee54549db3f5562ff8d1b8", + "sha256:867a5ad16892bf20e6c4ea2aab1971f45645ff3102ad29bd84c86027fa99997b", + "sha256:877473e675fdcc113c138813a5dd440da0769a2d81f4d86614e5d62b69497155", + "sha256:8892f89999ffd992208754851e5a052f6b5db70a1e3f7d54b17c5211e37a98c7", + "sha256:9a9845c4c6bb56e508651f005c4aeb0404e518c6f000d5a1123ab077ab769f5c", + "sha256:a1e6e96217a0f72e2b8629e271e1b280c6fa3fe6e59fa8f6701bec14e3354325", + "sha256:a8156e6a7f5e2a0ff0c5b21d6bcb45145efece1909efcbbbf48c56f8da68221d", + "sha256:a9506a7e80bcf6eacfff7f804c0ad5350c8c95b9010e4356a4b36f5322f09abb", + "sha256:af310ec8335016b5e52cae60cda4a4f2a60a788cbb949a4fbea13d441aa5a09e", + "sha256:b0297b1e05fd128d26cc2460c810d42e205d16d76799526dfa8c8ccd50e74959", 
"sha256:bf68f4b2b6683e52bec69273562df15af352e5ed25d1b6641e7efddc5951d1a7",
+                "sha256:d0c1bc2fa9a7285719e5678584f6b92572a5b639d0e471bb8d4b650a1a910920",
+                "sha256:d4d9d6c1a455d4babd320203b918ccc7fcbefe308615c521062bc2ba1aa4d26e",
+                "sha256:db1fa631737dab9fa0b37f3979d8d2631e348c3b4e8325d6873c2541d0ae5a48",
+                "sha256:dd93ea5c0c7f3e25335ab7d22a507b1dc43976e1345508f845efc573d3d779d8",
+                "sha256:f44e517131a98f7a76696a7b21b164bcb85291cee106a23beccce454e1f433a4",
+                "sha256:f7ee479e96f7ee350db1cf24afa5685a5899e2b34992fb99e1f7c1b0b758d263"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==5.4.0"
+        }
+    },
+    "develop": {}
+}
From b67e8e1a0e43405f120469794532e4c67842947d Mon Sep 17 00:00:00 2001
From: Ramon Melo
Date: Sun, 16 May 2021 19:12:19 -0300
Subject: [PATCH 2/5] Scrapy startproject.

---
 scrawlinkinpark/scrapy.cfg                    |  11 ++
 scrawlinkinpark/scrawlinkinpark/__init__.py   |   0
 scrawlinkinpark/scrawlinkinpark/items.py      |  13 +++
 .../scrawlinkinpark/middlewares.py            | 103 ++++++++++++++++++
 scrawlinkinpark/scrawlinkinpark/pipelines.py  |  35 ++++++
 scrawlinkinpark/scrawlinkinpark/settings.py   |  74 +++++++++++++
 .../scrawlinkinpark/spiders/__init__.py       |   4 +
 7 files changed, 240 insertions(+)
 create mode 100644 scrawlinkinpark/scrapy.cfg
 create mode 100644 scrawlinkinpark/scrawlinkinpark/__init__.py
 create mode 100644 scrawlinkinpark/scrawlinkinpark/items.py
 create mode 100644 scrawlinkinpark/scrawlinkinpark/middlewares.py
 create mode 100644 scrawlinkinpark/scrawlinkinpark/pipelines.py
 create mode 100644 scrawlinkinpark/scrawlinkinpark/settings.py
 create mode 100644 scrawlinkinpark/scrawlinkinpark/spiders/__init__.py

diff --git a/scrawlinkinpark/scrapy.cfg b/scrawlinkinpark/scrapy.cfg
new file mode 100644
index 0000000..899bea4
--- /dev/null
+++ b/scrawlinkinpark/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+#
https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = scrawlinkinpark.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = scrawlinkinpark
diff --git a/scrawlinkinpark/scrawlinkinpark/__init__.py b/scrawlinkinpark/scrawlinkinpark/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrawlinkinpark/scrawlinkinpark/items.py b/scrawlinkinpark/scrawlinkinpark/items.py
new file mode 100644
index 0000000..5f31886
--- /dev/null
+++ b/scrawlinkinpark/scrawlinkinpark/items.py
@@ -0,0 +1,13 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import attr  # namespace used by @attr.s / attr.ib below
+
+
+@attr.s
+class QuoteItem:  # item model; every field has an empty default
+    title: str = attr.ib(default='')
+    author: dict = attr.ib(factory=lambda: {'name': '', 'url': ''})  # new dict per item, never shared
+    tags: list = attr.ib(factory=list)  # new list per item, never shared
diff --git a/scrawlinkinpark/scrawlinkinpark/middlewares.py b/scrawlinkinpark/scrawlinkinpark/middlewares.py
new file mode 100644
index 0000000..b94b762
--- /dev/null
+++ b/scrawlinkinpark/scrawlinkinpark/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class ScrawlinkinparkSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create this middleware.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class ScrawlinkinparkDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create this middleware.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/scrawlinkinpark/scrawlinkinpark/pipelines.py b/scrawlinkinpark/scrawlinkinpark/pipelines.py
new file mode 100644
index 0000000..062910a
--- /dev/null
+++ b/scrawlinkinpark/scrawlinkinpark/pipelines.py
@@ -0,0 +1,35 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+import pymongo
+
+
+class ScrawlinkinparkPipeline:  # NOTE(review): settings.py as added in this patch sets neither ITEM_PIPELINES nor MONGO_URI - confirm a later patch does
+    collection_name = 'ramon_melo'  # MongoDB collection that process_item() inserts into
+
+    def __init__(self, mongo_uri, mongo_db):  # only stores the values; the client is created in open_spider()
+        self.mongo_uri = mongo_uri
+        self.mongo_db = mongo_db
+
+    @classmethod
+    def from_crawler(cls, crawler):  # builds the pipeline from the crawler's settings
+        return cls(
+            mongo_uri=crawler.settings.get('MONGO_URI'),  # no fallback value is passed for MONGO_URI
+            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')  # 'items' is the fallback database name
+        )
+
+    def open_spider(self, spider):  # creates the client and selects the database
+        self.client = pymongo.MongoClient(self.mongo_uri)
+        self.db = self.client[self.mongo_db]
+
+    def close_spider(self, spider):  # closes the client created in open_spider()
+        self.client.close()
+
+    def process_item(self, item, spider):  # inserts the item as a dict, then returns the item itself
+        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
+        return item
diff --git a/scrawlinkinpark/scrawlinkinpark/settings.py b/scrawlinkinpark/scrawlinkinpark/settings.py
new file mode 100644
index 0000000..c88e052
--- /dev/null
+++
b/scrawlinkinpark/scrawlinkinpark/settings.py @@ -0,0 +1,74 @@ +# Scrapy settings for scrawlinkinpark project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'scrawlinkinpark' + +SPIDER_MODULES = ['scrawlinkinpark.spiders'] +NEWSPIDER_MODULE = 'scrawlinkinpark.spiders' + + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'scrawlinkinpark.middlewares.ScrawlinkinparkSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'scrawlinkinpark.middlewares.ScrawlinkinparkDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See 
https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + diff --git a/scrawlinkinpark/scrawlinkinpark/spiders/__init__.py b/scrawlinkinpark/scrawlinkinpark/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/scrawlinkinpark/scrawlinkinpark/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. From 471f17f361787bf545b4e3e95bc1b6572cb0a966 Mon Sep 17 00:00:00 2001 From: Ramon Melo Date: Sun, 16 May 2021 19:17:23 -0300 Subject: [PATCH 3/5] Crawler itself. 
--- scrawlinkinpark/scrawlinkinpark/settings.py | 6 +++++ .../scrawlinkinpark/spiders/quotes_spyder.py | 25 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 scrawlinkinpark/scrawlinkinpark/spiders/quotes_spyder.py diff --git a/scrawlinkinpark/scrawlinkinpark/settings.py b/scrawlinkinpark/scrawlinkinpark/settings.py index c88e052..7a76840 100644 --- a/scrawlinkinpark/scrawlinkinpark/settings.py +++ b/scrawlinkinpark/scrawlinkinpark/settings.py @@ -51,6 +51,12 @@ # 'scrapy.extensions.telnet.TelnetConsole': None, #} +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'scrawlinkinpark.pipelines.ScrawlinkinparkPipeline': 300, +} + # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True diff --git a/scrawlinkinpark/scrawlinkinpark/spiders/quotes_spyder.py b/scrawlinkinpark/scrawlinkinpark/spiders/quotes_spyder.py new file mode 100644 index 0000000..b258f0a --- /dev/null +++ b/scrawlinkinpark/scrawlinkinpark/spiders/quotes_spyder.py @@ -0,0 +1,25 @@ +import scrapy + + +class QuotesSpider(scrapy.Spider): + name = 'quotes' + start_urls = [ + 'https://quotes.toscrape.com/', + ] + + + def parse(self, response): + for quote in response.css('div.quote'): + yield { + 'title': quote.css('span.text::text').get(), + 'author': { + 'name': quote.xpath('span/small/text()').get(), + 'url': 'https://quotes.toscrape.com{}'.format( + quote.xpath('span/a/@href').get()) + }, + 'tags': quote.css('div.tags a.tag::text').getall(), + } + + next_page = response.css('li.next a::attr("href")').get() + if next_page is not None: + yield response.follow(next_page, self.parse) From 8c440e3ba44b33f7a367374dcbf6999d1d709578 Mon Sep 17 00:00:00 2001 From: Ramon Melo Date: Sun, 16 May 2021 19:30:19 -0300 Subject: [PATCH 4/5] Working 100% as intended. MongoDB functional. 
--- scrawlinkinpark/queries.js | 20 ++++++++++++++++++++ scrawlinkinpark/scrawlinkinpark/settings.py | 14 ++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 scrawlinkinpark/queries.js diff --git a/scrawlinkinpark/queries.js b/scrawlinkinpark/queries.js new file mode 100644 index 0000000..b5b9480 --- /dev/null +++ b/scrawlinkinpark/queries.js @@ -0,0 +1,20 @@ +// Question #1: How many quotes were collected? +// NOTE: official docs recommend avoiding `.count()` 2021-05-16 18:29:28 +// https://docs.mongodb.com/manual/reference/method/db.collection.count/ +db.ramon_melo.countDocuments({}) + +// Question #2: How many distinct tags were collected? +db.ramon_melo.distinct('tags').length + +// Question #3: How many quotes per author were collected? +db.ramon_melo.aggregate([ + { + $group: { + _id: '$author.name', + qtd: { $sum: 1 } + } + }, + { + $sort: { qtd: -1 } + } +]) diff --git a/scrawlinkinpark/scrawlinkinpark/settings.py b/scrawlinkinpark/scrawlinkinpark/settings.py index 7a76840..be6148b 100644 --- a/scrawlinkinpark/scrawlinkinpark/settings.py +++ b/scrawlinkinpark/scrawlinkinpark/settings.py @@ -13,6 +13,13 @@ NEWSPIDER_MODULE = 'scrawlinkinpark.spiders' +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'scrawlinkinpark (+http://www.yourdomain.com)' + +# Obey robots.txt rules +## NOTE: adds unnecessary overhead https://doc.scrapy.org/en/1.1/news.html#id8 +ROBOTSTXT_OBEY = False + # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 @@ -24,6 +31,10 @@ #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 +# Disable cookies (enabled by default) +## NOTE: Cookies improve undesired bot detection 2021-05-16 19:18:58 +COOKIES_ENABLED = False + # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False @@ -78,3 +89,6 @@ #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 
+# MongoDB project settings (version 4.4.6) +MONGO_URI = "mongodb://127.0.0.1:27017" +MONGO_DATABASE = "quotestoscrape" From 37bf1e9fc57ab10f46b79e946452657e6bd73fed Mon Sep 17 00:00:00 2001 From: Ramon Melo Date: Sun, 16 May 2021 22:58:58 -0300 Subject: [PATCH 5/5] README finished. Requirements frozen. Tests OK. --- Pipfile | 1 + Pipfile.lock | 39 ++++++- requirements.txt | 46 ++++++++ scrawlinkinpark/README.md | 55 ++++++++++ scrawlinkinpark/scrawlinkinpark/settings.py | 19 +++- .../scrawlinkinpark/spiders/quotes_spyder.py | 9 +- tests/quotes.json | 102 ++++++++++++++++++ tests/scrapy_bench.out | 71 ++++++++++++ 8 files changed, 333 insertions(+), 9 deletions(-) create mode 100644 requirements.txt create mode 100644 scrawlinkinpark/README.md create mode 100644 tests/quotes.json create mode 100644 tests/scrapy_bench.out diff --git a/Pipfile b/Pipfile index 73782d2..18b30e3 100644 --- a/Pipfile +++ b/Pipfile @@ -8,6 +8,7 @@ scrapy = "*" scrapyd = "*" pymongo = "*" attrs = "*" +scrapy-fake-useragent = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 03eb33a..3da90c8 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e367ec51f608550f043a8291a576a56108f5927961aebe8e394ce6d6608693aa" + "sha256": "c889fc3bf4bc5583ddbedaf9710ea59b2d8a38adc8d044c06512a1ee4988248e" }, "pipfile-spec": 6, "requires": { @@ -106,6 +106,20 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.1.0" }, + "fake-useragent": { + "hashes": [ + "sha256:c104998b750eb097eefc28ae28e92d66397598d2cf41a31aa45d5559ef1adf35" + ], + "version": "==0.1.11" + }, + "faker": { + "hashes": [ + "sha256:73562fb99b6046c5d26b8dd98a1437a896f8601c96382d835c656166159f4f59", + "sha256:c6a4a0a1dde71f16d489a3097661a87ae96329dbde4c3ece8a5ccc340441ade1" + ], + "markers": "python_version >= '3.6'", + "version": "==8.1.4" + }, "h2": { "hashes": [ 
"sha256:61e0f6601fa709f35cdb730863b4e5ec7ad449792add80d1410d4174ed139af5", @@ -376,6 +390,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==20.0.1" }, + "python-dateutil": { + "hashes": [ + "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", + "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.8.1" + }, "queuelib": { "hashes": [ "sha256:631d067c9be57e395c382d680d3653ca1452cd29e8da25c5e8d94b5c0c528c31", @@ -391,6 +413,14 @@ "index": "pypi", "version": "==2.5.0" }, + "scrapy-fake-useragent": { + "hashes": [ + "sha256:3b17e982e646918dc25080da0672812d07bfb7a92a58377c014c74e0182c665e", + "sha256:da0589d9245fe6348b491821f3be3387dd6563540146058e6b6c4f1bbe1358bf" + ], + "index": "pypi", + "version": "==1.4.4" + }, "scrapyd": { "hashes": [ "sha256:7887b2b6d1d84291528cb65b6a2cce95d630e027c32330e72a5aa16710a6c3cb", @@ -414,6 +444,13 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, + "text-unidecode": { + "hashes": [ + "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8", + "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93" + ], + "version": "==1.3" + }, "twisted": { "extras": [ "http2" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8f476b2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,46 @@ +# +# These requirements were autogenerated by pipenv +# To regenerate from the project's Pipfile, run: +# +# pipenv lock --requirements +# + +-i https://pypi.org/simple +attrs==21.2.0 +automat==20.2.0 +cffi==1.14.5 +constantly==15.1.0 +cryptography==3.4.7; python_version >= '3.6' +cssselect==1.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +fake-useragent==0.1.11 +faker==8.1.4; python_version >= '3.6' 
+h2==3.2.0 +hpack==3.0.0 +hyperframe==5.2.0 +hyperlink==21.0.0 +idna==3.1; python_version >= '3.4' +incremental==21.3.0 +itemadapter==0.2.0; python_version >= '3.6' +itemloaders==1.0.4; python_version >= '3.6' +jmespath==0.10.0; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' +lxml==4.6.3; platform_python_implementation == 'CPython' +parsel==1.6.0 +priority==1.3.0 +protego==0.1.16; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +pyasn1-modules==0.2.8 +pyasn1==0.4.8 +pycparser==2.20; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +pydispatcher==2.0.5; platform_python_implementation == 'CPython' +pymongo==3.11.4 +pyopenssl==20.0.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +python-dateutil==2.8.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +queuelib==1.6.1 +scrapy-fake-useragent==1.4.4 +scrapy==2.5.0 +scrapyd==1.2.1 +service-identity==21.1.0 +six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +text-unidecode==1.3 +twisted[http2]==21.2.0; python_full_version >= '3.5.4' +w3lib==1.22.0 +zope.interface==5.4.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' diff --git a/scrawlinkinpark/README.md b/scrawlinkinpark/README.md new file mode 100644 index 0000000..f166f1a --- /dev/null +++ b/scrawlinkinpark/README.md @@ -0,0 +1,55 @@ +>_Scrawling in my scheme_ +_These quotes, they will not yield..._ + +# Desafio Webcrawler BIT + +## Requisitos Funcionais + +1. Construir um web crawler com o _framework_ [Scrapy](https://scrapy.org/). +2. Coletar as citações na página _http://quotes.toscrape.com_. +3. Armazenar citação (string), autor (dicionário) com nome (string) e url da bio (string) e _tags_ (array) numa base de dados _MongoDB_. +4. Responder, através de _queries_ no _MongoDB_: + - Quantas citações foram coletadas? + - Quantas tags distintas foram coletadas? 
+   - Quantas citações por autor foram coletadas? _(exemplo abaixo)_ + +![](https://github.com/b2w-atech/desafio-webcrawler/raw/master/mongodb_aggregate.png) + +## Requisitos Não-Funcionais + +1. Construir um crawler robusto, ágil e resistente a mudanças da matriz tecnológica. +2. Estruturar os componentes de forma coesa, sem lhes comprometer com acoplamento excessivo. +3. Manter uma documentação em código concisa, legível e cuidadosa. + +## Instalação + +Embora não haja um impeditivo imperativo sobre o uso da instalação global, o uso de um ambiente virtual Python é extremamente encorajado. O mecanismo mais adequado é o [Pipenv](https://pipenv.pypa.io/en/latest/install/), através do comando `pipenv install && pipenv shell`, que o criará, instalará todas as dependências (exceto o _MongoDB_) automaticamente e o ativará. Também é possível usar o `pip3` através do comando `pip3 install -r `[`requirements.txt`](../requirements.txt). Apenas certifique-se de que a versão do Python é compatível. + +## Execução + +Usando a versão 2.5.0 do [Scrapy](https://scrapy.org/) sobre [Python 3.9.5](https://www.python.org/downloads/release/python-395/) e ao lado do [MongoDB 4.4.2](https://docs.mongodb.com/manual/installation/), um _pipeline_ simplificado foi construído para raspar o site de citações e armazená-las num banco de dados de nome `quotestoscrape`, sob coleção denominada `ramon_melo`. O robô armazena o texto da citação como uma string; o autor como um dicionário de chaves `name` e `url`; e as etiquetas numa lista (array). + +As consultas de teste estão gravadas em [`queries.js`](queries.js). Os resultados foram obtidos através do comando: + +> [`scrawlinkinpark/scrawlinkinpark/spiders`](scrawlinkinpark/spiders/quotes_spyder.py)`$ scrapy runspider quotes_spyder.py -O `[`../../../tests/quotes.json`](../tests/quotes.json) + + +É importante observar com atenção o diretório onde o programa é executado. 
Ele grava os resultados sequencial e similarmente no arquivo [`tests/quotes.json`](../tests/quotes.json) e no banco de dados localizado no endereço padrão `mongodb://127.0.0.1:27017`, permitindo uma comparação direta e até automatizada dos mesmos. Caso outro endereço seja utilizado, é importante modificar a variável `MONGO_URI` ao final do arquivo [`settings.py`](scrawlinkinpark/settings.py). Utilizando a ferramenta _MongoDB Shell_ e execuções sucessivas, foi observada total concordância entre ambos. + +O comando acima inclui, ainda, a possibilidade de varrer somente uma categoria do site, passando o parâmetro `-a tag=CATEGORIA`. As configurações incluem, também, customizações específicas de privacidade para o robô, já que é comum que sites tentem impedir a varredura quando notado. O programa tem a capacidade de circular entre diferentes configurações de strings de _user-agent_, cabeçalhos de navegadores comuns e, por fim, de retornar à configuração padrão. + +## Dependências + +- [Python 3.9](https://www.python.org/downloads/release/python-395/) +- [Scrapy 2.5](https://scrapy.org/) +- [MongoDB 4.4](https://docs.mongodb.com/manual/installation/) +- [PyMongo 3.11](https://pymongo.readthedocs.io/en/stable/installation.html) +- [Attrs 21.2](https://www.attrs.org/en/stable/index.html#getting-started) (melhoria de legibilidade) +- [scrapy-fake-useragent](https://github.com/alecxe/scrapy-fake-useragent) (privacidade) + +## Conclusão + +O projeto foi uma oportunidade bem aproveitada de usar um domingo frio para aquecer minhas engrenagens e praticar uma _reciclagem_. Agradeço desde já o desafio e toda a atenção dispensada. + +>_This spyder endlessly has pulled the Web upon me_ +_Distracting... 
Overwriting..._ diff --git a/scrawlinkinpark/scrawlinkinpark/settings.py b/scrawlinkinpark/scrawlinkinpark/settings.py index be6148b..7d12c3f 100644 --- a/scrawlinkinpark/scrawlinkinpark/settings.py +++ b/scrawlinkinpark/scrawlinkinpark/settings.py @@ -13,8 +13,13 @@ NEWSPIDER_MODULE = 'scrawlinkinpark.spiders' -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'scrawlinkinpark (+http://www.yourdomain.com)' +FAKEUSERAGENT_PROVIDERS = [ + 'scrapy_fake_useragent.providers.FakeUserAgentProvider', + 'scrapy_fake_useragent.providers.FakerProvider', + 'scrapy_fake_useragent.providers.FixedUserAgentProvider', +] +## NOTE: if all else fails, fallback to Mozilla Firefox +USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0' # Obey robots.txt rules ## NOTE: adds unnecessary overhead https://doc.scrapy.org/en/1.1/news.html#id8 @@ -33,6 +38,7 @@ # Disable cookies (enabled by default) ## NOTE: Cookies improve undesired bot detection 2021-05-16 19:18:58 +## https://docs.scrapy.org/en/latest/topics/practices.html#avoiding-getting-banned COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) @@ -52,9 +58,12 @@ # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'scrawlinkinpark.middlewares.ScrawlinkinparkDownloaderMiddleware': 543, -#} +DOWNLOADER_MIDDLEWARES = { + 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, + 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None, + 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400, + 'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401, +} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html diff --git a/scrawlinkinpark/scrawlinkinpark/spiders/quotes_spyder.py b/scrawlinkinpark/scrawlinkinpark/spiders/quotes_spyder.py index b258f0a..91e1580 100644 --- 
a/scrawlinkinpark/scrawlinkinpark/spiders/quotes_spyder.py +++ b/scrawlinkinpark/scrawlinkinpark/spiders/quotes_spyder.py @@ -3,9 +3,12 @@ class QuotesSpider(scrapy.Spider): name = 'quotes' - start_urls = [ - 'https://quotes.toscrape.com/', - ] + + def __init__(self, tag='', **kwargs): + if tag: + tag = f'tag/{tag}' + self.start_urls = [f'https://quotes.toscrape.com/{tag}'] + super().__init__(**kwargs) def parse(self, response): diff --git a/tests/quotes.json b/tests/quotes.json new file mode 100644 index 0000000..ea7da27 --- /dev/null +++ b/tests/quotes.json @@ -0,0 +1,102 @@ +[ +{"title": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["change", "deep-thoughts", "thinking", "world"]}, +{"title": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d", "author": {"name": "J.K. Rowling", "url": "https://quotes.toscrape.com/author/J-K-Rowling"}, "tags": ["abilities", "choices"]}, +{"title": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. 
The other is as though everything is a miracle.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["inspirational", "life", "live", "miracle", "miracles"]}, +{"title": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d", "author": {"name": "Jane Austen", "url": "https://quotes.toscrape.com/author/Jane-Austen"}, "tags": ["aliteracy", "books", "classic", "humor"]}, +{"title": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d", "author": {"name": "Marilyn Monroe", "url": "https://quotes.toscrape.com/author/Marilyn-Monroe"}, "tags": ["be-yourself", "inspirational"]}, +{"title": "\u201cTry not to become a man of success. Rather become a man of value.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["adulthood", "success", "value"]}, +{"title": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d", "author": {"name": "Andr\u00e9 Gide", "url": "https://quotes.toscrape.com/author/Andre-Gide"}, "tags": ["life", "love"]}, +{"title": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d", "author": {"name": "Thomas A. 
Edison", "url": "https://quotes.toscrape.com/author/Thomas-A-Edison"}, "tags": ["edison", "failure", "inspirational", "paraphrased"]}, +{"title": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "author": {"name": "Eleanor Roosevelt", "url": "https://quotes.toscrape.com/author/Eleanor-Roosevelt"}, "tags": ["misattributed-eleanor-roosevelt"]}, +{"title": "\u201cA day without sunshine is like, you know, night.\u201d", "author": {"name": "Steve Martin", "url": "https://quotes.toscrape.com/author/Steve-Martin"}, "tags": ["humor", "obvious", "simile"]}, +{"title": "\u201cThis life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go of them. Also remember, sisters make the best friends in the world. As for lovers, well, they'll come and go too. And baby, I hate to say it, most of them - actually pretty much all of them are going to break your heart, but you can't give up because if you give up, you'll never find your soulmate. You'll never find that half who makes you whole and that goes for everything. Just because you fail once, doesn't mean you're gonna fail at everything. Keep trying, hold on, and always, always, always believe in yourself, because if you don't, then who will, sweetie? 
So keep your head high, keep your chin up, and most importantly, keep smiling, because life's a beautiful thing and there's so much to smile about.\u201d", "author": {"name": "Marilyn Monroe", "url": "https://quotes.toscrape.com/author/Marilyn-Monroe"}, "tags": ["friends", "heartbreak", "inspirational", "life", "love", "sisters"]}, +{"title": "\u201cIt takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.\u201d", "author": {"name": "J.K. Rowling", "url": "https://quotes.toscrape.com/author/J-K-Rowling"}, "tags": ["courage", "friends"]}, +{"title": "\u201cIf you can't explain it to a six year old, you don't understand it yourself.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["simplicity", "understand"]}, +{"title": "\u201cYou may not be her first, her last, or her only. She loved before she may love again. But if she loves you now, what else matters? She's not perfect\u2014you aren't either, and the two of you may never be perfect together but if she can make you laugh, cause you to think twice, and admit to being human and making mistakes, hold onto her and give her the most you can. She may not be thinking about you every second of the day, but she will give you a part of her that she knows you can break\u2014her heart. So don't hurt her, don't change her, don't analyze and don't expect more than she can give. Smile when she makes you happy, let her know when she makes you mad, and miss her when she's not there.\u201d", "author": {"name": "Bob Marley", "url": "https://quotes.toscrape.com/author/Bob-Marley"}, "tags": ["love"]}, +{"title": "\u201cI like nonsense, it wakes up the brain cells. Fantasy is a necessary ingredient in living.\u201d", "author": {"name": "Dr. 
Seuss", "url": "https://quotes.toscrape.com/author/Dr-Seuss"}, "tags": ["fantasy"]}, +{"title": "\u201cI may not have gone where I intended to go, but I think I have ended up where I needed to be.\u201d", "author": {"name": "Douglas Adams", "url": "https://quotes.toscrape.com/author/Douglas-Adams"}, "tags": ["life", "navigation"]}, +{"title": "\u201cThe opposite of love is not hate, it's indifference. The opposite of art is not ugliness, it's indifference. The opposite of faith is not heresy, it's indifference. And the opposite of life is not death, it's indifference.\u201d", "author": {"name": "Elie Wiesel", "url": "https://quotes.toscrape.com/author/Elie-Wiesel"}, "tags": ["activism", "apathy", "hate", "indifference", "inspirational", "love", "opposite", "philosophy"]}, +{"title": "\u201cIt is not a lack of love, but a lack of friendship that makes unhappy marriages.\u201d", "author": {"name": "Friedrich Nietzsche", "url": "https://quotes.toscrape.com/author/Friedrich-Nietzsche"}, "tags": ["friendship", "lack-of-friendship", "lack-of-love", "love", "marriage", "unhappy-marriage"]}, +{"title": "\u201cGood friends, good books, and a sleepy conscience: this is the ideal life.\u201d", "author": {"name": "Mark Twain", "url": "https://quotes.toscrape.com/author/Mark-Twain"}, "tags": ["books", "contentment", "friends", "friendship", "life"]}, +{"title": "\u201cLife is what happens to us while we are making other plans.\u201d", "author": {"name": "Allen Saunders", "url": "https://quotes.toscrape.com/author/Allen-Saunders"}, "tags": ["fate", "life", "misattributed-john-lennon", "planning", "plans"]}, +{"title": "\u201cI love you without knowing how, or when, or from where. 
I love you simply, without problems or pride: I love you in this way because I do not know any other way of loving but this, in which there is no I or you, so intimate that your hand upon my chest is my hand, so intimate that when I fall asleep your eyes close.\u201d", "author": {"name": "Pablo Neruda", "url": "https://quotes.toscrape.com/author/Pablo-Neruda"}, "tags": ["love", "poetry"]}, +{"title": "\u201cFor every minute you are angry you lose sixty seconds of happiness.\u201d", "author": {"name": "Ralph Waldo Emerson", "url": "https://quotes.toscrape.com/author/Ralph-Waldo-Emerson"}, "tags": ["happiness"]}, +{"title": "\u201cIf you judge people, you have no time to love them.\u201d", "author": {"name": "Mother Teresa", "url": "https://quotes.toscrape.com/author/Mother-Teresa"}, "tags": ["attributed-no-source"]}, +{"title": "\u201cAnyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.\u201d", "author": {"name": "Garrison Keillor", "url": "https://quotes.toscrape.com/author/Garrison-Keillor"}, "tags": ["humor", "religion"]}, +{"title": "\u201cBeauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.\u201d", "author": {"name": "Jim Henson", "url": "https://quotes.toscrape.com/author/Jim-Henson"}, "tags": ["humor"]}, +{"title": "\u201cToday you are You, that is truer than true. There is no one alive who is Youer than You.\u201d", "author": {"name": "Dr. Seuss", "url": "https://quotes.toscrape.com/author/Dr-Seuss"}, "tags": ["comedy", "life", "yourself"]}, +{"title": "\u201cIf you want your children to be intelligent, read them fairy tales. 
If you want them to be more intelligent, read them more fairy tales.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["children", "fairy-tales"]}, +{"title": "\u201cIt is impossible to live without failing at something, unless you live so cautiously that you might as well not have lived at all - in which case, you fail by default.\u201d", "author": {"name": "J.K. Rowling", "url": "https://quotes.toscrape.com/author/J-K-Rowling"}, "tags": []}, +{"title": "\u201cLogic will get you from A to Z; imagination will get you everywhere.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["imagination"]}, +{"title": "\u201cOne good thing about music, when it hits you, you feel no pain.\u201d", "author": {"name": "Bob Marley", "url": "https://quotes.toscrape.com/author/Bob-Marley"}, "tags": ["music"]}, +{"title": "\u201cThe more that you read, the more things you will know. The more that you learn, the more places you'll go.\u201d", "author": {"name": "Dr. Seuss", "url": "https://quotes.toscrape.com/author/Dr-Seuss"}, "tags": ["learning", "reading", "seuss"]}, +{"title": "\u201cOf course it is happening inside your head, Harry, but why on earth should that mean that it is not real?\u201d", "author": {"name": "J.K. Rowling", "url": "https://quotes.toscrape.com/author/J-K-Rowling"}, "tags": ["dumbledore"]}, +{"title": "\u201cThe truth is, everyone is going to hurt you. You just got to find the ones worth suffering for.\u201d", "author": {"name": "Bob Marley", "url": "https://quotes.toscrape.com/author/Bob-Marley"}, "tags": ["friendship"]}, +{"title": "\u201cNot all of us can do great things. 
But we can do small things with great love.\u201d", "author": {"name": "Mother Teresa", "url": "https://quotes.toscrape.com/author/Mother-Teresa"}, "tags": ["misattributed-to-mother-teresa", "paraphrased"]}, +{"title": "\u201cTo the well-organized mind, death is but the next great adventure.\u201d", "author": {"name": "J.K. Rowling", "url": "https://quotes.toscrape.com/author/J-K-Rowling"}, "tags": ["death", "inspirational"]}, +{"title": "\u201cAll you need is love. But a little chocolate now and then doesn't hurt.\u201d", "author": {"name": "Charles M. Schulz", "url": "https://quotes.toscrape.com/author/Charles-M-Schulz"}, "tags": ["chocolate", "food", "humor"]}, +{"title": "\u201cWe read to know we're not alone.\u201d", "author": {"name": "William Nicholson", "url": "https://quotes.toscrape.com/author/William-Nicholson"}, "tags": ["misattributed-to-c-s-lewis", "reading"]}, +{"title": "\u201cAny fool can know. The point is to understand.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["knowledge", "learning", "understanding", "wisdom"]}, +{"title": "\u201cI have always imagined that Paradise will be a kind of library.\u201d", "author": {"name": "Jorge Luis Borges", "url": "https://quotes.toscrape.com/author/Jorge-Luis-Borges"}, "tags": ["books", "library"]}, +{"title": "\u201cIt is never too late to be what you might have been.\u201d", "author": {"name": "George Eliot", "url": "https://quotes.toscrape.com/author/George-Eliot"}, "tags": ["inspirational"]}, +{"title": "\u201cA reader lives a thousand lives before he dies, said Jojen. The man who never reads lives only one.\u201d", "author": {"name": "George R.R. Martin", "url": "https://quotes.toscrape.com/author/George-R-R-Martin"}, "tags": ["read", "readers", "reading", "reading-books"]}, +{"title": "\u201cYou can never get a cup of tea large enough or a book long enough to suit me.\u201d", "author": {"name": "C.S. 
Lewis", "url": "https://quotes.toscrape.com/author/C-S-Lewis"}, "tags": ["books", "inspirational", "reading", "tea"]}, +{"title": "\u201cYou believe lies so you eventually learn to trust no one but yourself.\u201d", "author": {"name": "Marilyn Monroe", "url": "https://quotes.toscrape.com/author/Marilyn-Monroe"}, "tags": []}, +{"title": "\u201cIf you can make a woman laugh, you can make her do anything.\u201d", "author": {"name": "Marilyn Monroe", "url": "https://quotes.toscrape.com/author/Marilyn-Monroe"}, "tags": ["girls", "love"]}, +{"title": "\u201cLife is like riding a bicycle. To keep your balance, you must keep moving.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["life", "simile"]}, +{"title": "\u201cThe real lover is the man who can thrill you by kissing your forehead or smiling into your eyes or just staring into space.\u201d", "author": {"name": "Marilyn Monroe", "url": "https://quotes.toscrape.com/author/Marilyn-Monroe"}, "tags": ["love"]}, +{"title": "\u201cA wise girl kisses but doesn't love, listens but doesn't believe, and leaves before she is left.\u201d", "author": {"name": "Marilyn Monroe", "url": "https://quotes.toscrape.com/author/Marilyn-Monroe"}, "tags": ["attributed-no-source"]}, +{"title": "\u201cOnly in the darkness can you see the stars.\u201d", "author": {"name": "Martin Luther King Jr.", "url": "https://quotes.toscrape.com/author/Martin-Luther-King-Jr"}, "tags": ["hope", "inspirational"]}, +{"title": "\u201cIt matters not what someone is born, but what they grow to be.\u201d", "author": {"name": "J.K. Rowling", "url": "https://quotes.toscrape.com/author/J-K-Rowling"}, "tags": ["dumbledore"]}, +{"title": "\u201cLove does not begin and end the way we seem to think it does. 
Love is a battle, love is a war; love is a growing up.\u201d", "author": {"name": "James Baldwin", "url": "https://quotes.toscrape.com/author/James-Baldwin"}, "tags": ["love"]}, +{"title": "\u201cThere is nothing I would not do for those who are really my friends. I have no notion of loving people by halves, it is not my nature.\u201d", "author": {"name": "Jane Austen", "url": "https://quotes.toscrape.com/author/Jane-Austen"}, "tags": ["friendship", "love"]}, +{"title": "\u201cDo one thing every day that scares you.\u201d", "author": {"name": "Eleanor Roosevelt", "url": "https://quotes.toscrape.com/author/Eleanor-Roosevelt"}, "tags": ["attributed", "fear", "inspiration"]}, +{"title": "\u201cI am good, but not an angel. I do sin, but I am not the devil. I am just a small girl in a big world trying to find someone to love.\u201d", "author": {"name": "Marilyn Monroe", "url": "https://quotes.toscrape.com/author/Marilyn-Monroe"}, "tags": ["attributed-no-source"]}, +{"title": "\u201cIf I were not a physicist, I would probably be a musician. I often think in music. I live my daydreams in music. 
I see my life in terms of music.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["music"]}, +{"title": "\u201cIf you only read the books that everyone else is reading, you can only think what everyone else is thinking.\u201d", "author": {"name": "Haruki Murakami", "url": "https://quotes.toscrape.com/author/Haruki-Murakami"}, "tags": ["books", "thought"]}, +{"title": "\u201cThe difference between genius and stupidity is: genius has its limits.\u201d", "author": {"name": "Alexandre Dumas fils", "url": "https://quotes.toscrape.com/author/Alexandre-Dumas-fils"}, "tags": ["misattributed-to-einstein"]}, +{"title": "\u201cHe's like a drug for you, Bella.\u201d", "author": {"name": "Stephenie Meyer", "url": "https://quotes.toscrape.com/author/Stephenie-Meyer"}, "tags": ["drug", "romance", "simile"]}, +{"title": "\u201cThere is no friend as loyal as a book.\u201d", "author": {"name": "Ernest Hemingway", "url": "https://quotes.toscrape.com/author/Ernest-Hemingway"}, "tags": ["books", "friends", "novelist-quotes"]}, +{"title": "\u201cWhen one door of happiness closes, another opens; but often we look so long at the closed door that we do not see the one which has been opened for us.\u201d", "author": {"name": "Helen Keller", "url": "https://quotes.toscrape.com/author/Helen-Keller"}, "tags": ["inspirational"]}, +{"title": "\u201cLife isn't about finding yourself. Life is about creating yourself.\u201d", "author": {"name": "George Bernard Shaw", "url": "https://quotes.toscrape.com/author/George-Bernard-Shaw"}, "tags": ["inspirational", "life", "yourself"]}, +{"title": "\u201cThat's the problem with drinking, I thought, as I poured myself a drink. 
If something bad happens you drink in an attempt to forget; if something good happens you drink in order to celebrate; and if nothing happens you drink to make something happen.\u201d", "author": {"name": "Charles Bukowski", "url": "https://quotes.toscrape.com/author/Charles-Bukowski"}, "tags": ["alcohol"]}, +{"title": "\u201cYou don\u2019t forget the face of the person who was your last hope.\u201d", "author": {"name": "Suzanne Collins", "url": "https://quotes.toscrape.com/author/Suzanne-Collins"}, "tags": ["the-hunger-games"]}, +{"title": "\u201cRemember, we're madly in love, so it's all right to kiss me anytime you feel like it.\u201d", "author": {"name": "Suzanne Collins", "url": "https://quotes.toscrape.com/author/Suzanne-Collins"}, "tags": ["humor"]}, +{"title": "\u201cTo love at all is to be vulnerable. Love anything and your heart will be wrung and possibly broken. If you want to make sure of keeping it intact you must give it to no one, not even an animal. Wrap it carefully round with hobbies and little luxuries; avoid all entanglements. Lock it up safe in the casket or coffin of your selfishness. But in that casket, safe, dark, motionless, airless, it will change. It will not be broken; it will become unbreakable, impenetrable, irredeemable. To love is to be vulnerable.\u201d", "author": {"name": "C.S. Lewis", "url": "https://quotes.toscrape.com/author/C-S-Lewis"}, "tags": ["love"]}, +{"title": "\u201cNot all those who wander are lost.\u201d", "author": {"name": "J.R.R. Tolkien", "url": "https://quotes.toscrape.com/author/J-R-R-Tolkien"}, "tags": ["bilbo", "journey", "lost", "quest", "travel", "wander"]}, +{"title": "\u201cDo not pity the dead, Harry. Pity the living, and, above all those who live without love.\u201d", "author": {"name": "J.K. Rowling", "url": "https://quotes.toscrape.com/author/J-K-Rowling"}, "tags": ["live-death-love"]}, +{"title": "\u201cThere is nothing to writing. 
All you do is sit down at a typewriter and bleed.\u201d", "author": {"name": "Ernest Hemingway", "url": "https://quotes.toscrape.com/author/Ernest-Hemingway"}, "tags": ["good", "writing"]}, +{"title": "\u201cFinish each day and be done with it. You have done what you could. Some blunders and absurdities no doubt crept in; forget them as soon as you can. Tomorrow is a new day. You shall begin it serenely and with too high a spirit to be encumbered with your old nonsense.\u201d", "author": {"name": "Ralph Waldo Emerson", "url": "https://quotes.toscrape.com/author/Ralph-Waldo-Emerson"}, "tags": ["life", "regrets"]}, +{"title": "\u201cI have never let my schooling interfere with my education.\u201d", "author": {"name": "Mark Twain", "url": "https://quotes.toscrape.com/author/Mark-Twain"}, "tags": ["education"]}, +{"title": "\u201cI have heard there are troubles of more than one kind. Some come from ahead and some come from behind. But I've bought a big bat. I'm all ready you see. Now my troubles are going to have troubles with me!\u201d", "author": {"name": "Dr. Seuss", "url": "https://quotes.toscrape.com/author/Dr-Seuss"}, "tags": ["troubles"]}, +{"title": "\u201cIf I had a flower for every time I thought of you...I could walk through my garden forever.\u201d", "author": {"name": "Alfred Tennyson", "url": "https://quotes.toscrape.com/author/Alfred-Tennyson"}, "tags": ["friendship", "love"]}, +{"title": "\u201cSome people never go crazy. What truly horrible lives they must lead.\u201d", "author": {"name": "Charles Bukowski", "url": "https://quotes.toscrape.com/author/Charles-Bukowski"}, "tags": ["humor"]}, +{"title": "\u201cThe trouble with having an open mind, of course, is that people will insist on coming along and trying to put things in it.\u201d", "author": {"name": "Terry Pratchett", "url": "https://quotes.toscrape.com/author/Terry-Pratchett"}, "tags": ["humor", "open-mind", "thinking"]}, +{"title": "\u201cThink left and think right and think low and think high. 
Oh, the thinks you can think up if only you try!\u201d", "author": {"name": "Dr. Seuss", "url": "https://quotes.toscrape.com/author/Dr-Seuss"}, "tags": ["humor", "philosophy"]}, +{"title": "\u201cWhat really knocks me out is a book that, when you're all done reading it, you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it. That doesn't happen much, though.\u201d", "author": {"name": "J.D. Salinger", "url": "https://quotes.toscrape.com/author/J-D-Salinger"}, "tags": ["authors", "books", "literature", "reading", "writing"]}, +{"title": "\u201cThe reason I talk to myself is because I\u2019m the only one whose answers I accept.\u201d", "author": {"name": "George Carlin", "url": "https://quotes.toscrape.com/author/George-Carlin"}, "tags": ["humor", "insanity", "lies", "lying", "self-indulgence", "truth"]}, +{"title": "\u201cYou may say I'm a dreamer, but I'm not the only one. I hope someday you'll join us. And the world will live as one.\u201d", "author": {"name": "John Lennon", "url": "https://quotes.toscrape.com/author/John-Lennon"}, "tags": ["beatles", "connection", "dreamers", "dreaming", "dreams", "hope", "inspirational", "peace"]}, +{"title": "\u201cI am free of all prejudice. I hate everyone equally. \u201d", "author": {"name": "W.C. 
Fields", "url": "https://quotes.toscrape.com/author/W-C-Fields"}, "tags": ["humor", "sinister"]}, +{"title": "\u201cThe question isn't who is going to let me; it's who is going to stop me.\u201d", "author": {"name": "Ayn Rand", "url": "https://quotes.toscrape.com/author/Ayn-Rand"}, "tags": []}, +{"title": "\u201c\u2032Classic\u2032 - a book which people praise and don't read.\u201d", "author": {"name": "Mark Twain", "url": "https://quotes.toscrape.com/author/Mark-Twain"}, "tags": ["books", "classic", "reading"]}, +{"title": "\u201cAnyone who has never made a mistake has never tried anything new.\u201d", "author": {"name": "Albert Einstein", "url": "https://quotes.toscrape.com/author/Albert-Einstein"}, "tags": ["mistakes"]}, +{"title": "\u201cA lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.\u201d", "author": {"name": "Jane Austen", "url": "https://quotes.toscrape.com/author/Jane-Austen"}, "tags": ["humor", "love", "romantic", "women"]}, +{"title": "\u201cRemember, if the time should come when you have to make a choice between what is right and what is easy, remember what happened to a boy who was good, and kind, and brave, because he strayed across the path of Lord Voldemort. Remember Cedric Diggory.\u201d", "author": {"name": "J.K. Rowling", "url": "https://quotes.toscrape.com/author/J-K-Rowling"}, "tags": ["integrity"]}, +{"title": "\u201cI declare after all there is no enjoyment like reading! How much sooner one tires of any thing than of a book! -- When I have a house of my own, I shall be miserable if I have not an excellent library.\u201d", "author": {"name": "Jane Austen", "url": "https://quotes.toscrape.com/author/Jane-Austen"}, "tags": ["books", "library", "reading"]}, +{"title": "\u201cThere are few people whom I really love, and still fewer of whom I think well. 
The more I see of the world, the more am I dissatisfied with it; and every day confirms my belief of the inconsistency of all human characters, and of the little dependence that can be placed on the appearance of merit or sense.\u201d", "author": {"name": "Jane Austen", "url": "https://quotes.toscrape.com/author/Jane-Austen"}, "tags": ["elizabeth-bennet", "jane-austen"]}, +{"title": "\u201cSome day you will be old enough to start reading fairy tales again.\u201d", "author": {"name": "C.S. Lewis", "url": "https://quotes.toscrape.com/author/C-S-Lewis"}, "tags": ["age", "fairytales", "growing-up"]}, +{"title": "\u201cWe are not necessarily doubting that God will do the best for us; we are wondering how painful the best will turn out to be.\u201d", "author": {"name": "C.S. Lewis", "url": "https://quotes.toscrape.com/author/C-S-Lewis"}, "tags": ["god"]}, +{"title": "\u201cThe fear of death follows from the fear of life. A man who lives fully is prepared to die at any time.\u201d", "author": {"name": "Mark Twain", "url": "https://quotes.toscrape.com/author/Mark-Twain"}, "tags": ["death", "life"]}, +{"title": "\u201cA lie can travel half way around the world while the truth is putting on its shoes.\u201d", "author": {"name": "Mark Twain", "url": "https://quotes.toscrape.com/author/Mark-Twain"}, "tags": ["misattributed-mark-twain", "truth"]}, +{"title": "\u201cI believe in Christianity as I believe that the sun has risen: not only because I see it, but because by it I see everything else.\u201d", "author": {"name": "C.S. Lewis", "url": "https://quotes.toscrape.com/author/C-S-Lewis"}, "tags": ["christianity", "faith", "religion", "sun"]}, +{"title": "\u201cThe truth.\" Dumbledore sighed. \"It is a beautiful and terrible thing, and should therefore be treated with great caution.\u201d", "author": {"name": "J.K. 
Rowling", "url": "https://quotes.toscrape.com/author/J-K-Rowling"}, "tags": ["truth"]}, +{"title": "\u201cI'm the one that's got to die when it's time for me to die, so let me live my life the way I want to.\u201d", "author": {"name": "Jimi Hendrix", "url": "https://quotes.toscrape.com/author/Jimi-Hendrix"}, "tags": ["death", "life"]}, +{"title": "\u201cTo die will be an awfully big adventure.\u201d", "author": {"name": "J.M. Barrie", "url": "https://quotes.toscrape.com/author/J-M-Barrie"}, "tags": ["adventure", "love"]}, +{"title": "\u201cIt takes courage to grow up and become who you really are.\u201d", "author": {"name": "E.E. Cummings", "url": "https://quotes.toscrape.com/author/E-E-Cummings"}, "tags": ["courage"]}, +{"title": "\u201cBut better to get hurt by the truth than comforted with a lie.\u201d", "author": {"name": "Khaled Hosseini", "url": "https://quotes.toscrape.com/author/Khaled-Hosseini"}, "tags": ["life"]}, +{"title": "\u201cYou never really understand a person until you consider things from his point of view... Until you climb inside of his skin and walk around in it.\u201d", "author": {"name": "Harper Lee", "url": "https://quotes.toscrape.com/author/Harper-Lee"}, "tags": ["better-life-empathy"]}, +{"title": "\u201cYou have to write the book that wants to be written. And if the book will be too difficult for grown-ups, then you write it for children.\u201d", "author": {"name": "Madeleine L'Engle", "url": "https://quotes.toscrape.com/author/Madeleine-LEngle"}, "tags": ["books", "children", "difficult", "grown-ups", "write", "writers", "writing"]}, +{"title": "\u201cNever tell the truth to people who are not worthy of it.\u201d", "author": {"name": "Mark Twain", "url": "https://quotes.toscrape.com/author/Mark-Twain"}, "tags": ["truth"]}, +{"title": "\u201cA person's a person, no matter how small.\u201d", "author": {"name": "Dr. Seuss", "url": "https://quotes.toscrape.com/author/Dr-Seuss"}, "tags": ["inspirational"]}, +{"title": "\u201c... 
a mind needs books as a sword needs a whetstone, if it is to keep its edge.\u201d", "author": {"name": "George R.R. Martin", "url": "https://quotes.toscrape.com/author/George-R-R-Martin"}, "tags": ["books", "mind"]} +] \ No newline at end of file diff --git a/tests/scrapy_bench.out b/tests/scrapy_bench.out new file mode 100644 index 0000000..009536e --- /dev/null +++ b/tests/scrapy_bench.out @@ -0,0 +1,71 @@ +2021-05-16 19:51:58 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: scrawlinkinpark) +2021-05-16 19:51:58 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.9.5 (default, May 5 2021, 02:58:34) - [GCC 7.5.0], pyOpenSSL 20.0.1 (OpenSSL 1.1.1k 25 Mar 2021), cryptography 3.4.7, Platform Linux-5.4.0-73-generic-x86_64-with-glibc2.28 +2021-05-16 19:51:58 [scrapy.crawler] INFO: Overridden settings: +{'BOT_NAME': 'scrawlinkinpark', + 'CLOSESPIDER_TIMEOUT': 10, + 'COOKIES_ENABLED': False, + 'LOGSTATS_INTERVAL': 1, + 'LOG_LEVEL': 'INFO', + 'NEWSPIDER_MODULE': 'scrawlinkinpark.spiders', + 'SPIDER_MODULES': ['scrawlinkinpark.spiders']} +2021-05-16 19:51:58 [scrapy.extensions.telnet] INFO: Telnet Password: c672a56d61a687b9 +2021-05-16 19:51:58 [scrapy.middleware] INFO: Enabled extensions: +['scrapy.extensions.corestats.CoreStats', + 'scrapy.extensions.telnet.TelnetConsole', + 'scrapy.extensions.memusage.MemoryUsage', + 'scrapy.extensions.closespider.CloseSpider', + 'scrapy.extensions.logstats.LogStats'] +2021-05-16 19:51:58 [scrapy.middleware] INFO: Enabled downloader middlewares: +['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', + 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', + 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', + 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', + 'scrapy.downloadermiddlewares.retry.RetryMiddleware', + 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', + 
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', + 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', + 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware', + 'scrapy.downloadermiddlewares.stats.DownloaderStats'] +2021-05-16 19:51:58 [scrapy.middleware] INFO: Enabled spider middlewares: +['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', + 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', + 'scrapy.spidermiddlewares.referer.RefererMiddleware', + 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', + 'scrapy.spidermiddlewares.depth.DepthMiddleware'] +2021-05-16 19:51:58 [scrapy.middleware] INFO: Enabled item pipelines: +['scrawlinkinpark.pipelines.ScrawlinkinparkPipeline'] +2021-05-16 19:51:58 [scrapy.core.engine] INFO: Spider opened +2021-05-16 19:51:58 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:51:58 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023 +2021-05-16 19:51:59 [scrapy.extensions.logstats] INFO: Crawled 78 pages (at 4680 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:00 [scrapy.extensions.logstats] INFO: Crawled 142 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:01 [scrapy.extensions.logstats] INFO: Crawled 214 pages (at 4320 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:02 [scrapy.extensions.logstats] INFO: Crawled 278 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:03 [scrapy.extensions.logstats] INFO: Crawled 334 pages (at 3360 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:04 [scrapy.extensions.logstats] INFO: Crawled 390 pages (at 3360 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:05 [scrapy.extensions.logstats] INFO: Crawled 438 pages (at 2880 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:06 [scrapy.extensions.logstats] INFO: Crawled 486 pages (at 
2880 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:07 [scrapy.extensions.logstats] INFO: Crawled 534 pages (at 2880 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:08 [scrapy.core.engine] INFO: Closing spider (closespider_timeout) +2021-05-16 19:52:08 [scrapy.extensions.logstats] INFO: Crawled 582 pages (at 2880 pages/min), scraped 0 items (at 0 items/min) +2021-05-16 19:52:09 [scrapy.statscollectors] INFO: Dumping Scrapy stats: +{'downloader/request_bytes': 270373, + 'downloader/request_count': 598, + 'downloader/request_method_count/GET': 598, + 'downloader/response_bytes': 1869504, + 'downloader/response_count': 598, + 'downloader/response_status_count/200': 598, + 'elapsed_time_seconds': 10.604241, + 'finish_reason': 'closespider_timeout', + 'finish_time': datetime.datetime(2021, 5, 16, 22, 52, 9, 404354), + 'log_count/INFO': 20, + 'memusage/max': 63811584, + 'memusage/startup': 63811584, + 'request_depth_max': 21, + 'response_received_count': 598, + 'scheduler/dequeued': 598, + 'scheduler/dequeued/memory': 598, + 'scheduler/enqueued': 11959, + 'scheduler/enqueued/memory': 11959, + 'start_time': datetime.datetime(2021, 5, 16, 22, 51, 58, 800113)} +2021-05-16 19:52:09 [scrapy.core.engine] INFO: Spider closed (closespider_timeout)