From eb9bada544b87d96ad81e927d8f02c140c217bca Mon Sep 17 00:00:00 2001
From: DarkCat09
Date: Sat, 27 Apr 2024 22:08:20 +0400
Subject: [PATCH] Genius lyrics parser (it works!)

---
 genius.py      | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++
 test_genius.py | 45 +++++++++++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 genius.py
 create mode 100644 test_genius.py

diff --git a/genius.py b/genius.py
new file mode 100644
index 0000000..a839d6a
--- /dev/null
+++ b/genius.py
@@ -0,0 +1,83 @@
+import re
+
+from lxml import html, etree  # type: ignore
+import urllib3
+
+
+# Default SearXNG instance queried by search()
+SEARX_URL = 'https://searx.dc09.ru/search'
+
+# \w matches not only Latin letters, but also
+# the Cyrillic alphabet, Chinese characters, etc.
+word_regex = re.compile(r'\w+')
+
+# Get all div elements with lyrics
+lyrics_xpath = etree.XPath('//div[@id="lyrics-root"][1]/div[@data-lyrics-container="true"]')
+
+# Get br elements to add newlines after them for correct text parsing
+br_xpath = etree.XPath('.//br')
+
+
+def search(
+    http: urllib3.PoolManager,
+    title: str,
+    artist: str,
+    searx_url: str = SEARX_URL,
+) -> str:
+    '''Search for Genius lyrics using SearXNG + Yahoo and return the first URL.
+
+    Irrelevant texts should be picked manually.
+
+    http is the connection pool used for the request; title and artist
+    are joined into the search query, which is restricted to genius.com;
+    searx_url is the SearXNG search endpoint to query.
+
+    Raises IndexError if the search returned no results.
+    '''
+
+    resp = http.request(
+        'GET',
+        searx_url,
+        fields={
+            'q': f'{artist} {title} site:genius.com',
+            'engines': 'yahoo',
+            'safesearch': '0',
+            'format': 'json',
+        },
+    )
+
+    results: list[dict[str, str]] = resp.json()['results']
+    if not results:
+        # Same exception type as indexing an empty list,
+        # but with a message that explains what went wrong
+        raise IndexError(f'no Genius results for {artist!r} - {title!r}')
+
+    return results[0]['url']
+
+
+def parse(http: urllib3.PoolManager, url: str) -> str:
+    '''Request a lyrics page and parse it with libxml2,
+    leaving only text and line breaks.
+
+    Returns an empty string if the page has no lyrics containers.
+    '''
+
+    resp = http.request('GET', url)
+    tree = html.document_fromstring(resp.data)
+    divs = lyrics_xpath(tree)
+    # Drop the references that are no longer needed before extracting text
+    del resp, tree
+
+    return '\n'.join(_parse_text(div) for div in divs).strip()
+
+
+def _parse_text(elem: html.HtmlElement) -> str:
+    '''Return the text of elem with a newline inserted after every br element.
+
+    Modifies elem in place: the tail of each br gets a leading newline.
+    '''
+
+    for br in br_xpath(elem):
+        br.tail = '\n' + (br.tail or '')
+
+    return elem.text_content()
diff --git a/test_genius.py b/test_genius.py
new file mode 100644
index 0000000..34ee5d2
--- /dev/null
+++ b/test_genius.py
@@ -0,0 +1,45 @@
+from unittest import TestCase
+import urllib3
+
+import genius
+
+
+TITLE = 'A Line In The Sand'
+ARTIST = 'Linkin Park'
+URL = 'https://genius.com/Linkin-park-a-line-in-the-sand-lyrics'
+
+LYR1 = '''[Intro: Mike Shinoda]
+Today, we stood on the wall'''
+
+LYR2 = '''little did we know
+
+[Instrumental Break]
+
+[Verse 1:'''
+
+LYR3 = '''you are gonna get yours
+
+[Chorus: Chester Bennington]
+Another day'''
+
+
+class TestGenius(TestCase):
+    '''Live tests for the genius module; they need network access.'''
+
+    def setUp(self) -> None:
+        self.http = urllib3.PoolManager()
+
+    def test_search_success(self) -> None:
+        url = genius.search(self.http, TITLE, ARTIST)
+        self.assertEqual(url, URL)
+
+    def test_lyrics_parsing(self) -> None:
+        lyrics = genius.parse(self.http, URL)
+        # Compare the prefix instead of assertTrue(startswith(...))
+        # so that a failure shows the differing text
+        self.assertEqual(lyrics[:len(LYR1)], LYR1)
+        self.assertIn(LYR2, lyrics)
+        self.assertIn(LYR3, lyrics)
+
+    def tearDown(self) -> None:
+        self.http.clear()