Genius lyrics parser (it works!)

2024-04-27 22:08:20 +04:00 · 2024-04-27 22:08:20 +04:00 · eb9bada544
commit eb9bada544
parent 89b8f154fc
2 changed files with 98 additions and 0 deletions
--- a/genius.py
+++ b/genius.py
@@ -0,0 +1,56 @@
 import re
 from lxml import html, etree  # type: ignore
 import urllib3
 # Matches runs of word characters. For str patterns \w is Unicode-aware,
 # so it covers not only Latin letters but also Cyrillic, Chinese characters, etc.
 word_regex = re.compile(r'\w+')
 # Selects the lyrics containers: <div data-lyrics-container="true"> elements
 # that are direct children of a <div id="lyrics-root">
 lyrics_xpath = etree.XPath('//div[@id="lyrics-root"][1]/div[@data-lyrics-container="true"]')
 # Selects every <br> descendant of the context element, so that a newline
 # can be inserted after each one before the text is extracted
 br_xpath = etree.XPath('.//br')
 def search(http: urllib3.PoolManager, title: str, artist: str) -> str:
    '''Searches for Genius lyrics using SearXNG + Yahoo and returns the first URL.

    The query sent is "<artist> <title> site:genius.com". Only the first hit
    is returned, so an irrelevant match has to be corrected manually.

    Args:
        http: urllib3 pool manager used to send the request.
        title: Song title.
        artist: Artist name.

    Returns:
        The URL of the first search result.

    Raises:
        IndexError: If the search returned no results.
    '''
    resp = http.request(
        'GET',
        'https://searx.dc09.ru/search',
        fields={
            'q': f'{artist} {title} site:genius.com',
            'engines': 'yahoo',
            'safesearch': '0',
            'format': 'json',
        },
    )
    results: list[dict[str, str]] = resp.json()['results']
    if not results:
        # Same exception type as indexing an empty list, but with a message
        # that tells the caller which query came back empty.
        raise IndexError(
            f'No Genius search results for {artist!r} - {title!r}'
        )
    return results[0]['url']
 def parse(http: urllib3.PoolManager, url: str) -> str:
    '''Downloads a lyrics page and extracts its plain text.

    The page is parsed with lxml; every lyrics container found is converted
    to text (keeping line breaks), the pieces are joined with a newline and
    surrounding whitespace is stripped.
    '''
    page = html.document_fromstring(http.request('GET', url).data)
    chunks = [_parse_text(container) for container in lyrics_xpath(page)]
    return '\n'.join(chunks).strip()
 def _parse_text(elem: html.HtmlElement) -> str:
    '''Returns the text of elem with a newline inserted after every <br>.

    Note: the <br> tails are rewritten in place, so elem is modified.
    '''
    for line_break in br_xpath(elem):
        # tail is the text that follows the <br>; it may be None
        following = line_break.tail if line_break.tail else ''
        line_break.tail = '\n' + following
    return elem.text_content()
--- a/test_genius.py
+++ b/test_genius.py
@@ -0,0 +1,42 @@
 from unittest import TestCase
 import urllib3
 import genius
 # Search input (title + artist) and the Genius URL the search is expected to return
 TITLE = 'A Line In The Sand'
 ARTIST = 'Linkin Park'
 URL = 'https://genius.com/Linkin-park-a-line-in-the-sand-lyrics'
 # Expected beginning of the parsed lyrics (checked with startswith)
 LYR1 = '''[Intro: Mike Shinoda]
 Today, we stood on the wall'''
 # Excerpts expected somewhere inside the parsed lyrics. Each one spans
 # several lines, so presumably they only match when line breaks are
 # preserved by the parser.
 LYR2 = '''little did we know
 [Instrumental Break]
 [Verse 1:'''
 LYR3 = '''you are gonna get yours
 [Chorus: Chester Bennington]
 Another day'''
 class TestGenius(TestCase):
    '''Tests for the genius module.

    Both tests go through a real urllib3.PoolManager, i.e. they send
    actual HTTP requests.
    '''

    def setUp(self) -> None:
        # A fresh connection pool for every test
        self.http = urllib3.PoolManager()

    def test_search_success(self) -> None:
        '''search() returns the expected Genius URL for TITLE / ARTIST.'''
        url = genius.search(self.http, TITLE, ARTIST)
        self.assertEqual(url, URL)

    def test_lyrics_parsing(self) -> None:
        '''parse() output starts with LYR1 and contains LYR2 and LYR3.'''
        lyrics = genius.parse(self.http, URL)
        self.assertTrue(
            lyrics.startswith(LYR1),
            msg=f'Unexpected start of lyrics: {lyrics[:80]!r}',
        )
        # assertIn reports both operands on failure, unlike assertTrue(a in b)
        self.assertIn(LYR2, lyrics)
        self.assertIn(LYR3, lyrics)

    def tearDown(self) -> None:
        # Drop the pooled connections opened by the test
        self.http.clear()