From 4e46b5b29cf684ef8a4e35358cf800012b2a5bb9 Mon Sep 17 00:00:00 2001 From: Nathan Easton Date: Fri, 9 Dec 2016 11:57:03 -0500 Subject: [PATCH 1/5] Fixed BeautifulSoup parse issue --- PyLyrics/functions.py | 6 +- .../lib.linux-x86_64-2.7/PyLyrics/__init__.py | 14 +++ .../lib.linux-x86_64-2.7/PyLyrics/classes.py | 5 + .../PyLyrics/functions.py | 110 ++++++++++++++++++ build/lib.linux-x86_64-2.7/PyLyrics/tests.py | 22 ++++ 5 files changed, 154 insertions(+), 3 deletions(-) create mode 100644 build/lib.linux-x86_64-2.7/PyLyrics/__init__.py create mode 100644 build/lib.linux-x86_64-2.7/PyLyrics/classes.py create mode 100644 build/lib.linux-x86_64-2.7/PyLyrics/functions.py create mode 100644 build/lib.linux-x86_64-2.7/PyLyrics/tests.py diff --git a/PyLyrics/functions.py b/PyLyrics/functions.py index e4c7e19..5bd1336 100644 --- a/PyLyrics/functions.py +++ b/PyLyrics/functions.py @@ -41,7 +41,7 @@ class PyLyrics: @staticmethod def getAlbums(singer): singer = singer.replace(' ', '_') - s = BeautifulSoup(requests.get('http://lyrics.wikia.com/{0}'.format(singer)).text) + s = BeautifulSoup(requests.get('http://lyrics.wikia.com/{0}'.format(singer)).text,"lxml") spans = s.findAll('span',{'class':'mw-headline'}) als = [] @@ -60,7 +60,7 @@ def getAlbums(singer): @staticmethod def getTracks(album): url = "http://lyrics.wikia.com/api.php?action=lyrics&artist={0}&fmt=xml".format(album.artist()) - soup = BeautifulSoup(requests.get(url).text) + soup = BeautifulSoup(requests.get(url).text,"lxml") for al in soup.find_all('album'): if al.text.lower().strip() == album.name.strip().lower(): @@ -75,7 +75,7 @@ def getLyrics(singer, song): singer = singer.replace(' ', '_') song = song.replace(' ', '_') r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(singer,song)) - s = BeautifulSoup(r.text) + s = BeautifulSoup(r.text,"lxml") #Get main lyrics holder lyrics = s.find("div",{'class':'lyricbox'}) if lyrics is None: diff --git a/build/lib.linux-x86_64-2.7/PyLyrics/__init__.py 
b/build/lib.linux-x86_64-2.7/PyLyrics/__init__.py new file mode 100644 index 0000000..6e85576 --- /dev/null +++ b/build/lib.linux-x86_64-2.7/PyLyrics/__init__.py @@ -0,0 +1,14 @@ +__author__ = "Pradipta" +__version__ = '1.0.0' + +try: + #Python 3 Imports + from .classes import * + from .functions import * +except: + #Python 2 imports + from classes import * + from functions import * + + + \ No newline at end of file diff --git a/build/lib.linux-x86_64-2.7/PyLyrics/classes.py b/build/lib.linux-x86_64-2.7/PyLyrics/classes.py new file mode 100644 index 0000000..a19e17f --- /dev/null +++ b/build/lib.linux-x86_64-2.7/PyLyrics/classes.py @@ -0,0 +1,5 @@ +#Classes for Scrapers +try: + from .functions import * +except: + from functions import * diff --git a/build/lib.linux-x86_64-2.7/PyLyrics/functions.py b/build/lib.linux-x86_64-2.7/PyLyrics/functions.py new file mode 100644 index 0000000..5bd1336 --- /dev/null +++ b/build/lib.linux-x86_64-2.7/PyLyrics/functions.py @@ -0,0 +1,110 @@ +import requests +from bs4 import BeautifulSoup, Comment, NavigableString +import sys, codecs, json + +class Track(object): + def __init__(self,trackName,album,artist): + self.name = trackName + self.album = album + self.artist = artist + def __repr__(self): + return self.name + def link(self): + return 'http://lyrics.wikia.com/{0}:{1}'.format(self.artist.replace(' ', '-'),self.name.replace(' ','-')) + def getLyrics(self): + return PyLyrics.getLyrics(self.artist,self.name) +class Artist(object): + def __init__(self, name): + self.name = name + def getAlbums(self): + return PyLyrics.getAlbums(self.name) + def __repr__(self): + return self.name.encode('utf-8') +class Album(object): + def __init__(self, name, link,singer): + self.year = name.split(' ')[-1] + self.name = name.replace(self.year,' ').rstrip() + self.url = link + self.singer = singer + def link(self): + return self.url + def __repr__(self): + if sys.version_info[0] == 2: + return self.name.encode('utf-8','replace') + return self.name 
+ def artist(self): + return self.singer + def tracks(self): + return PyLyrics.getTracks(self) + +class PyLyrics: + @staticmethod + def getAlbums(singer): + singer = singer.replace(' ', '_') + s = BeautifulSoup(requests.get('http://lyrics.wikia.com/{0}'.format(singer)).text,"lxml") + spans = s.findAll('span',{'class':'mw-headline'}) + + als = [] + + for tag in spans: + try: + a = tag.findAll('a')[0] + als.append(Album(a.text,'http://lyrics.wikia.com' + a['href'],singer)) + except: + pass + + if als == []: + raise ValueError("Unknown Artist Name given") + return None + return als + @staticmethod + def getTracks(album): + url = "http://lyrics.wikia.com/api.php?action=lyrics&artist={0}&fmt=xml".format(album.artist()) + soup = BeautifulSoup(requests.get(url).text,"lxml") + + for al in soup.find_all('album'): + if al.text.lower().strip() == album.name.strip().lower(): + currentAlbum = al + break + songs =[Track(song.text,album,album.artist()) for song in currentAlbum.findNext('songs').findAll('item')] + return songs + + @staticmethod + def getLyrics(singer, song): + #Replace spaces with _ + singer = singer.replace(' ', '_') + song = song.replace(' ', '_') + r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(singer,song)) + s = BeautifulSoup(r.text,"lxml") + #Get main lyrics holder + lyrics = s.find("div",{'class':'lyricbox'}) + if lyrics is None: + raise ValueError("Song or Singer does not exist or the API does not have Lyrics") + return None + #Remove Scripts + [s.extract() for s in lyrics('script')] + + #Remove Comments + comments = lyrics.findAll(text=lambda text:isinstance(text, Comment)) + [comment.extract() for comment in comments] + + #Remove unecessary tags + for tag in ['div','i','b','a']: + for match in lyrics.findAll(tag): + match.replaceWithChildren() + #Get output as a string and remove non unicode characters and replace
<br> with newlines + output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode("utf-8").replace('\n','').replace('<br/>','\n') + try: + return output + except: + return output.encode('utf-8') + +def main(): + albums = PyLyrics.getAlbums('OneRepublic') + print (albums) + tracks = PyLyrics.getTracks(albums[-1]) + print (tracks[7].getLyrics()) + + +if __name__=='__main__': + main() \ No newline at end of file diff --git a/build/lib.linux-x86_64-2.7/PyLyrics/tests.py b/build/lib.linux-x86_64-2.7/PyLyrics/tests.py new file mode 100644 index 0000000..0ee31d7 --- /dev/null +++ b/build/lib.linux-x86_64-2.7/PyLyrics/tests.py @@ -0,0 +1,22 @@ +import unittest +try: + from .__init__ import * #Python 3 +except: + from __init__ import * + +try: + basestring = basestring +except NameError: + basestring = (str, bytes) + +albums = PyLyrics.getAlbums('Taylor Swift') +class PyLyricsTest(unittest.TestCase): + def testAlbums(self): + self.assertIsInstance(albums,list) + def testTracks(self): + self.assertIsInstance(albums[0].tracks(),list) + def testLyrics(self): + self.assertIsInstance(PyLyrics.getLyrics('Eminem','The Monster'),basestring) + +if __name__=='__main__': + unittest.main() \ No newline at end of file From c9166bfac15f8bde6730f98622f33e9ed4dfdda4 Mon Sep 17 00:00:00 2001 From: Nathan Easton Date: Fri, 9 Dec 2016 12:02:59 -0500 Subject: [PATCH 2/5] Added lxml --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d47ab33..b70bb14 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ packages=['PyLyrics'], url="http://github.com/geekpradd/PyLyrics", install_requires=[ - 'beautifulsoup4','requests',], + 'beautifulsoup4','requests','lxml'], classifiers=[ "Development Status :: 5 - Production/Stable", "Topic :: Internet", From 54d81221b235bdc97e771360dafcedf4f6f039d0 Mon Sep 17 00:00:00 2001 From: Nathan Easton Date: Fri, 9 Dec 2016 12:06:26 -0500 Subject: [PATCH 3/5] Fixed currentAlbum not being referenced before use. 
--- PyLyrics/functions.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/PyLyrics/functions.py b/PyLyrics/functions.py index 5bd1336..aed8306 100644 --- a/PyLyrics/functions.py +++ b/PyLyrics/functions.py @@ -66,7 +66,11 @@ def getTracks(album): if al.text.lower().strip() == album.name.strip().lower(): currentAlbum = al break - songs =[Track(song.text,album,album.artist()) for song in currentAlbum.findNext('songs').findAll('item')] + if currentAlbum!=None: + currentAlbum="" + songs =[Track(song.text,album,album.artist()) for song in currentAlbum.findNext('songs').findAll('item')] + else: + songs =[Track(song.text,album,album.artist()) for song in currentAlbum.findNext('songs').findAll('item')] return songs @staticmethod From fc706fdaeea948e880c72601547a76b0a884827b Mon Sep 17 00:00:00 2001 From: Nathan Easton Date: Thu, 16 Feb 2017 13:34:47 -0500 Subject: [PATCH 4/5] Commented out a bad line --- PyLyrics/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyLyrics/functions.py b/PyLyrics/functions.py index aed8306..5699e5e 100644 --- a/PyLyrics/functions.py +++ b/PyLyrics/functions.py @@ -67,7 +67,7 @@ def getTracks(album): currentAlbum = al break if currentAlbum!=None: - currentAlbum="" + #currentAlbum="" songs =[Track(song.text,album,album.artist()) for song in currentAlbum.findNext('songs').findAll('item')] else: songs =[Track(song.text,album,album.artist()) for song in currentAlbum.findNext('songs').findAll('item')] From 52741b7ec57150e82e5c2ee00bbe60f701add1f3 Mon Sep 17 00:00:00 2001 From: T-Bone Date: Sat, 18 Mar 2017 15:30:39 -0400 Subject: [PATCH 5/5] Decodes UTF8 when pulling lyrics b/c Wikia has lots of weird characters --- PyLyrics/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyLyrics/functions.py b/PyLyrics/functions.py index 5699e5e..e23a906 100644 --- a/PyLyrics/functions.py +++ b/PyLyrics/functions.py @@ -97,7 +97,7 @@ def getLyrics(singer, song): for match in 
lyrics.findAll(tag): match.replaceWithChildren() #Get output as a string and remove non unicode characters and replace <br> with newlines - output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode("utf-8").replace('\n','').replace('<br/>','\n') + output = str(lyrics).decode('utf-8', errors='replace').encode('utf-8', errors='replace')[22:-6:].decode("utf-8").replace('\n','').replace('<br/>','\n') try: return output except: