Genius lyrics parser (it works!)

This commit is contained in:
DarkCat09 2024-04-27 22:08:20 +04:00
parent 89b8f154fc
commit eb9bada544
Signed by: DarkCat09
GPG key ID: 0A26CD5B3345D6E3
2 changed files with 98 additions and 0 deletions

56
genius.py Normal file
View file

@ -0,0 +1,56 @@
import re
from lxml import html, etree # type: ignore
import urllib3
# Matches runs of word characters. For str patterns `\w` is Unicode-aware,
# so this covers not only Latin letters but also the Cyrillic alphabet,
# Chinese characters, etc.
# NOTE(review): not referenced by the code visible in this file.
word_regex = re.compile(r'\w+')
# Selects the lyrics container <div>s (data-lyrics-container="true") that are
# direct children of the <div id="lyrics-root"> element.
lyrics_xpath = etree.XPath('//div[@id="lyrics-root"][1]/div[@data-lyrics-container="true"]')
# Selects every descendant <br> of the context element, so that a newline
# can be added after each one for correct text parsing.
br_xpath = etree.XPath('.//br')
def search(http: urllib3.PoolManager, title: str, artist: str) -> str:
    '''Search for a song's Genius lyrics page and return the first hit's URL.

    The query is sent to a SearXNG instance, restricted to the Yahoo engine
    and to ``site:genius.com``. Only the first result is used, so an
    irrelevant match has to be corrected manually.

    Args:
        http: Pool manager used to perform the request.
        title: Song title.
        artist: Artist name.

    Returns:
        The ``url`` field of the first search result.

    Raises:
        IndexError: If the search returned an empty result list.
        KeyError: If the response has no ``results`` key or the first
            result has no ``url`` key.
    '''
    resp = http.request(
        'GET',
        'https://searx.dc09.ru/search',
        fields={
            'q': f'{artist} {title} site:genius.com',
            'engines': 'yahoo',
            'safesearch': '0',
            'format': 'json',
        },
    )
    results: list[dict[str, str]] = resp.json()['results']
    del resp
    if not results:
        # Same exception type as plain indexing would raise, but with a
        # message that says what actually went wrong.
        raise IndexError(
            f'no Genius search results for {artist!r} - {title!r}'
        )
    return results[0]['url']
def parse(http: urllib3.PoolManager, url: str) -> str:
    '''Download a lyrics page and reduce it to plain text.

    The page is parsed with lxml, the text of every lyrics container is
    extracted (with line breaks kept) and the pieces are joined with
    newlines. Leading and trailing whitespace is stripped from the result.'''
    page = http.request('GET', url)
    document = html.document_fromstring(page.data)
    containers = lyrics_xpath(document)
    # Only the selected containers are needed from here on.
    del page, document
    chunks = [_parse_text(container) for container in containers]
    return '\n'.join(chunks).strip()
def _parse_text(elem: html.HtmlElement) -> str:
    '''Return the text content of *elem* with a newline after each <br>.

    Note: the tails of the <br> elements under *elem* are modified in place.'''
    for line_break in br_xpath(elem):
        following = line_break.tail if line_break.tail else ''
        line_break.tail = '\n' + following
    return elem.text_content()

42
test_genius.py Normal file
View file

@ -0,0 +1,42 @@
from unittest import TestCase
import urllib3
import genius
# Song used as the fixture for the tests below.
TITLE = 'A Line In The Sand'
ARTIST = 'Linkin Park'
# URL the search is expected to return, and the page whose lyrics are parsed.
URL = 'https://genius.com/Linkin-park-a-line-in-the-sand-lyrics'
# Expected beginning of the parsed lyrics.
LYR1 = '''[Intro: Mike Shinoda]
Today, we stood on the wall'''
# Fragments expected somewhere inside the parsed lyrics. Each one spans a
# line break followed by a section header.
LYR2 = '''little did we know
[Instrumental Break]
[Verse 1:'''
LYR3 = '''you are gonna get yours
[Chorus: Chester Bennington]
Another day'''
class TestGenius(TestCase):
    '''Tests for the genius module.

    Both tests call the module with a real urllib3 pool manager, so they
    perform actual HTTP requests.'''

    def setUp(self) -> None:
        # Fresh pool manager for every test.
        self.http = urllib3.PoolManager()

    def test_search_success(self) -> None:
        '''The search returns the expected lyrics page URL.'''
        url = genius.search(self.http, TITLE, ARTIST)
        self.assertEqual(url, URL)

    def test_lyrics_parsing(self) -> None:
        '''Parsed lyrics start with LYR1 and contain LYR2 and LYR3.'''
        lyrics = genius.parse(self.http, URL)
        self.assertTrue(lyrics.startswith(LYR1))
        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn(LYR2, lyrics)
        self.assertIn(LYR3, lyrics)

    def tearDown(self) -> None:
        self.http.clear()