77 lines
2.1 KiB
Python
77 lines
2.1 KiB
Python
import re
|
|
|
|
from lxml import html, etree # type: ignore
|
|
|
|
import http_pool
|
|
|
|
|
|
# For str patterns \w covers word characters of any script,
# so this finds words written in Latin, Cyrillic, Chinese characters, etc.
word_regex = re.compile(r'\w+')
|
|
|
|
# Selects the <div data-lyrics-container="true"> elements that are
# direct children of <div id="lyrics-root"> (the blocks with lyrics)
lyrics_xpath = etree.XPath(
    '//div[@id="lyrics-root"][1]'
    '/div[@data-lyrics-container="true"]'
)
|
|
|
|
# Finds every <br> below the context element; a newline is inserted
# after each of them so that line breaks survive text extraction
br_xpath = etree.XPath('.//br')
|
|
|
|
|
|
def search(title: str, artist: str) -> tuple[str, str]:
    '''Searches for Genius lyrics through a SearXNG instance
    (Brave + Yahoo engines, query limited to site:genius.com)
    and returns the first result as tuple(title, url).

    No error handling is done here: a response without
    a non-empty 'results' entry propagates the lookup error'''

    pool = http_pool.get()
    params = {
        'q': ' '.join((artist, title, 'site:genius.com')),
        'engines': 'brave,yahoo',
        'safesearch': '0',
        'format': 'json',
    }
    response = pool.request(
        'GET',
        'https://searx.dc09.ru/search',
        fields=params,
    )

    # Only the top hit is used
    first_hit: dict[str, str] = response.json()['results'][0]
    del response

    return first_hit['title'], first_hit['url']
|
|
|
|
|
|
def raise_on_irrelevant_result(res_title: str, track_track: str, track_artist: str) -> None:
    '''Raises ValueError unless the search result looks like the requested track.

    res_title is expected in the form "<artist> \u2013 <track>"
    (en dash, U+2013, with a space on each side).
    The result is accepted only if BOTH conditions hold:
    - at least one word from track_artist occurs in the artist part
    - at least one word from track_track occurs in the track part

    Matching is case-insensitive and substring-based.
    A track_artist or track_track without any word characters
    can never match, so such input is always rejected.

    Raises ValueError when the separator is missing
    or when either of the two conditions fails'''

    # Split on the first separator only: track titles may contain dashes too
    res_artist, separator, res_track = res_title.lower().partition(' \u2013 ')
    if not separator:
        raise ValueError(
            f'search result title has no " \u2013 " separator: {res_title!r}'
        )

    artist_matches = any(
        word.group(0).lower() in res_artist
        for word in word_regex.finditer(track_artist)
    )
    if not artist_matches:
        raise ValueError(
            f'no word of artist {track_artist!r} found in search result {res_title!r}'
        )

    track_matches = any(
        word.group(0).lower() in res_track
        for word in word_regex.finditer(track_track)
    )
    if not track_matches:
        raise ValueError(
            f'no word of track {track_track!r} found in search result {res_title!r}'
        )
|
|
|
|
|
|
def parse(url: str) -> str:
    '''Downloads the lyrics page at url, parses it with lxml
    and returns the lyrics as plain text, keeping only
    the text itself and the line breaks.

    Text of each lyrics container is joined with a newline,
    the final string is stripped of surrounding whitespace'''

    page = http_pool.get().request('GET', url)
    document = html.document_fromstring(page.data)
    containers = lyrics_xpath(document)
    # Raw response and document root are not needed past this point
    del page, document

    chunks = [_parse_text(container) for container in containers]
    return '\n'.join(chunks).strip()
|
|
|
|
|
|
def _parse_text(elem: html.HtmlElement) -> str:
    '''Returns the text content of elem
    with a newline inserted after every <br>.

    Note: modifies elem in place (the tail of each <br> inside it)'''

    for line_break in br_xpath(elem):
        trailing = line_break.tail
        # text_content() drops the tags themselves, so the break
        # has to be put into the text that follows the <br>
        line_break.tail = ('\n' + trailing) if trailing else '\n'

    return elem.text_content()
|