import re

from lxml import html, etree  # type: ignore

import http_pool

# Matches not only Latin letters,
# but also the Cyrillic alphabet, Chinese characters...
word_regex = re.compile(r'\w+')

# Get all <div>s with lyrics
lyrics_xpath = etree.XPath('//div[@id="lyrics-root"][1]/div[@data-lyrics-container="true"]')

# Get <br>s to add newlines after them for correct text parsing
br_xpath = etree.XPath('.//br')


def search(title: str, artist: str) -> tuple[str, str]:
    '''Searches for Genius lyrics using SearXNG (Brave engine)
    and returns the first result as tuple(title, url).

    The query is "<artist> <title> site:genius.com".

    Raises IndexError if the search returned no results.
    '''

    # NOTE(review): http_pool.get() presumably returns a urllib3-style
    # pool manager (request(..., fields=...), resp.json()) -- confirm.
    resp = http_pool.get().request(
        'GET',
        'https://searx.dc09.ru/search',
        fields={
            'q': artist + ' ' + title + ' site:genius.com',
            'engines': 'brave',
            'safesearch': '0',
            'format': 'json',
        },
    )

    results = resp.json()['results']
    del resp

    if not results:
        # Same exception type that indexing an empty list would raise,
        # but with a message explaining what actually went wrong
        raise IndexError(f'No Genius search results for {artist!r} - {title!r}')

    result: dict[str, str] = results[0]
    return (result['title'], result['url'])


def _any_word_in(words_source: str, haystack: str) -> bool:
    '''Returns True if at least one word (as matched by word_regex)
    from words_source, lowercased, is a substring of haystack'''

    return any(
        word.group(0).lower() in haystack
        for word in word_regex.finditer(words_source)
    )


def raise_on_irrelevant_result(res_title: str, track_track: str, track_artist: str) -> None:
    '''Raises ValueError unless the search result looks relevant.

    res_title is lowercased and split once on an en dash surrounded
    by spaces into an artist part and a track part. The result is
    considered relevant only if at least one word of track_artist occurs
    in the artist part AND at least one word of track_track occurs
    in the track part (case-insensitive substring match).

    ValueError is also raised if res_title has no such separator.
    '''

    res_artist, sep, res_track = res_title.lower().partition(' \u2013 ')
    if not sep:
        raise ValueError(
            f'Search result title {res_title!r} is not in "artist \u2013 track" form'
        )

    if not _any_word_in(track_artist, res_artist):
        raise ValueError(
            f'No word of artist {track_artist!r} found in search result {res_title!r}'
        )

    if not _any_word_in(track_track, res_track):
        raise ValueError(
            f'No word of track {track_track!r} found in search result {res_title!r}'
        )


def parse(url: str) -> str:
    '''Requests a lyrics page and parses with libxml2,
    leaving only text and line breaks.

    Text of all lyrics containers is joined with newlines and stripped;
    an empty string is returned if no container matched.
    '''

    resp = http_pool.get().request('GET', url)
    tree = html.document_fromstring(resp.data)
    divs = lyrics_xpath(tree)
    # Drop references to the raw response and the tree root early;
    # only the matched <div>s are needed from here on
    del resp, tree

    return '\n'.join(_parse_text(div) for div in divs).strip()


def _parse_text(elem: html.HtmlElement) -> str:
    '''Returns the text content of elem with a newline in place of
    every <br> inside it. Modifies elem in place (tails of <br>s).'''

    for br in br_xpath(elem):
        # text_content() ignores tags themselves, so the line break
        # is put into the text that directly follows each <br>
        br.tail = '\n' + (br.tail or '')

    return elem.text_content()