DarkCat09
0b0759fb3b
Small note on how it works, copied from raise_on_irrelevant_result() docstring: Raises ValueError if no words from track title are present in search result track title and no words from artist name are present in search result artist name
77 lines
2.1 KiB
Python
77 lines
2.1 KiB
Python
import re
|
|
|
|
from lxml import html, etree # type: ignore
|
|
|
|
import http_pool
|
|
|
|
|
|
# Matches runs of word characters. For str patterns \w is Unicode-aware,
# so this covers not only Latin letters but also the Cyrillic alphabet,
# Chinese characters, etc. (digits and underscores are matched as well).
word_regex = re.compile(r'\w+')
|
|
|
|
# Get all <div>s with lyrics: every <div data-lyrics-container="true">
# that is a direct child of a <div id="lyrics-root">
# (the [1] predicate keeps only the first such <div> within its parent)
lyrics_xpath = etree.XPath('//div[@id="lyrics-root"][1]/div[@data-lyrics-container="true"]')
|
|
|
|
# Get all <br>s nested anywhere inside the context element, so that
# a newline can be added after each of them for correct text parsing
br_xpath = etree.XPath('.//br')
|
|
|
|
|
|
def search(title: str, artist: str) -> tuple[str, str]:
    '''Looks up the Genius lyrics page for a track
    through the SearXNG instance (asking for the "brave" engine)
    and returns the first result as tuple(title, url).

    The query is "<artist> <title> site:genius.com".
    There is no handling for an empty result list:
    taking the first entry fails in that case.'''

    query = ' '.join((artist, title, 'site:genius.com'))
    params = {
        'q': query,
        'engines': 'brave',
        'safesearch': '0',
        'format': 'json',
    }
    response = http_pool.get().request(
        'GET',
        'https://searx.dc09.ru/search',
        fields=params,
    )

    # Only the first entry of the "results" array is used
    first_hit: dict[str, str] = response.json()['results'][0]
    return (first_hit['title'], first_hit['url'])
|
|
|
|
|
|
def raise_on_irrelevant_result(res_title: str, track_track: str, track_artist: str) -> None:
    '''Raises ValueError if the search result does not look like the requested track.

    res_title is expected to have the form "<artist> \u2013 <track>"
    (an en dash surrounded by spaces); everything after the first
    separator is treated as the track part.

    The result is accepted only when at least one word of track_artist
    occurs in the artist part AND at least one word of track_track
    occurs in the track part. The comparison is case-insensitive
    and substring-based.

    Raises ValueError when res_title has no " \u2013 " separator,
    when no word of the artist name is found in the result artist,
    or when no word of the track title is found in the result track.'''

    res_artist, separator, res_track = res_title.lower().partition(' \u2013 ')
    if not separator:
        # Without the separator the title cannot be split into artist and
        # track; report that explicitly instead of failing on unpacking
        raise ValueError(f'unexpected search result title format: {res_title!r}')

    artist_found = any(
        word.group(0).lower() in res_artist
        for word in word_regex.finditer(track_artist)
    )
    if not artist_found:
        raise ValueError(
            f'no word of artist {track_artist!r} found in search result {res_title!r}'
        )

    track_found = any(
        word.group(0).lower() in res_track
        for word in word_regex.finditer(track_track)
    )
    if not track_found:
        raise ValueError(
            f'no word of track {track_track!r} found in search result {res_title!r}'
        )
|
|
|
|
|
|
def parse(url: str) -> str:
    '''Downloads a lyrics page, parses it with lxml
    and returns the text of all lyrics containers
    joined with newlines, leaving only text and line breaks'''

    page_bytes = http_pool.get().request('GET', url).data
    containers = lyrics_xpath(html.document_fromstring(page_bytes))
    # The raw page is not needed once the containers are selected
    del page_bytes

    chunks = [_parse_text(container) for container in containers]
    return '\n'.join(chunks).strip()
|
|
|
|
|
|
def _parse_text(elem: html.HtmlElement) -> str:
    '''Returns the text content of elem with a newline
    inserted after every <br> (elem is modified in place)'''

    for line_break in br_xpath(elem):
        following = line_break.tail
        # text_content() drops the markup, so the line break is written
        # into the text that follows the <br>
        line_break.tail = '\n' + following if following else '\n'

    return elem.text_content()
|