import re

from lxml import html, etree  # type: ignore
import urllib3

# Matches not only Latin symbols,
# but also the Cyrillic alphabet, Chinese characters...
word_regex = re.compile(r'\w+')

# Get all <div>s with lyrics
lyrics_xpath = etree.XPath(
    '//div[@id="lyrics-root"][1]/div[@data-lyrics-container="true"]'
)

# Get <br>s to add newlines after them for correct text parsing
br_xpath = etree.XPath('.//br')


def search(http: urllib3.PoolManager, title: str, artist: str) -> str:
    '''Searches for Genius lyrics using SearXNG + Yahoo
    and returns the first URL.

    The query is "<artist> <title> site:genius.com". Only the first
    search result is used, so an irrelevant match is not detected here
    and has to be sorted out manually by the caller.

    Args:
        http: Pool manager used to perform the GET request.
        title: Song title.
        artist: Artist name.

    Returns:
        The `url` field of the first search result.

    Raises:
        KeyError: The JSON response has no `results` key,
            or the first result has no `url` key.
        IndexError: The `results` list is empty.
    '''

    resp = http.request(
        'GET',
        'https://searx.dc09.ru/search',
        fields={
            'q': artist + ' ' + title + ' site:genius.com',
            'engines': 'yahoo',
            'safesearch': '0',
            'format': 'json',
        },
    )

    result: dict[str, str] = resp.json()['results'][0]
    # Drop the reference to the response as soon as it is not needed
    del resp

    return result['url']


def parse(http: urllib3.PoolManager, url: str) -> str:
    '''Requests a lyrics page and parses with libxml2,
    leaving only text and line breaks.

    Args:
        http: Pool manager used to perform the GET request.
        url: Address of the lyrics page.

    Returns:
        Text of all lyrics containers joined with newlines, with
        leading and trailing whitespace stripped. An empty string
        if the page has no matching containers.
    '''

    resp = http.request('GET', url)
    tree = html.document_fromstring(resp.data)
    divs = lyrics_xpath(tree)
    # Only the selected containers are used from here on
    del resp, tree

    return '\n'.join(_parse_text(div) for div in divs).strip()


def _parse_text(elem: html.HtmlElement) -> str:
    '''Returns the text content of `elem` with a newline
    inserted after every <br> descendant.

    NOTE: modifies `elem` in place (the tail text of each <br>),
    so calling it twice on the same element adds the newlines twice.
    '''

    for br in br_xpath(elem):
        # text_content() ignores tags themselves, so the line break
        # has to be put into the text that follows the <br>
        br.tail = '\n' + (br.tail or '')

    return elem.text_content()