# musicdlp/backend/genius.py

import re

from lxml import html, etree  # type: ignore
import urllib3


# Matches runs of word characters. For str patterns `\w` is Unicode-aware,
# so this covers not only Latin letters but also Cyrillic, Chinese
# characters, etc. (digits and underscores are matched as well).
word_regex = re.compile(r'\w+')

# Selects every <div data-lyrics-container="true"> that is a direct child
# of a <div id="lyrics-root"> element; each one holds a part of the lyrics
lyrics_xpath = etree.XPath('//div[@id="lyrics-root"][1]/div[@data-lyrics-container="true"]')

# Selects all <br> descendants of the context element, so that a newline
# can be added after each of them for correct text parsing
br_xpath = etree.XPath('.//br')


def search(http: urllib3.PoolManager, title: str, artist: str) -> str:
    '''Searches for Genius lyrics using SearXNG + Yahoo and returns the first URL.

    The query sent to the search engine is "<artist> <title> site:genius.com".
    The top hit is returned as-is, without any relevance check,
    so irrelevant texts should be picked manually.

    Raises IndexError if the search returned no results.'''

    resp = http.request(
        'GET',
        'https://searx.dc09.ru/search',
        fields={
            'q': artist + ' ' + title + ' site:genius.com',
            'engines': 'yahoo',
            'safesearch': '0',
            'format': 'json',
        },
    )

    results = resp.json()['results']
    if not results:
        # Same exception type as plain indexing of an empty list would give,
        # but with a message that tells what actually went wrong
        raise IndexError(f'no Genius results found for {artist!r} - {title!r}')

    return results[0]['url']


def parse(http: urllib3.PoolManager, url: str) -> str:
    '''Downloads the lyrics page at the given URL, parses it with lxml
    and returns the text of all lyrics containers joined with newlines,
    with leading and trailing whitespace stripped'''

    # The response and the document tree are temporaries here,
    # only the matched lyrics <div>s are kept
    containers = lyrics_xpath(
        html.document_fromstring(http.request('GET', url).data)
    )

    chunks = [_parse_text(container) for container in containers]
    return '\n'.join(chunks).strip()


def _parse_text(elem: html.HtmlElement) -> str:
    '''Returns the text content of the element with a newline
    inserted after every <br> inside it.
    Modifies the tails of those <br> elements in place'''

    for line_break in br_xpath(elem):
        trailing = line_break.tail
        # A missing or empty tail becomes a lone newline
        line_break.tail = ('\n' + trailing) if trailing else '\n'

    return elem.text_content()