Lyrics: switch to multiple search engines

Brave sometimes ratelimits SearXNG server
Lyrics: add irrelevant results filtering
2024-05-06 20:34:03 +04:00 · 2024-05-06 20:31:42 +04:00 · 2024-05-06 19:43:32 +04:00 · 2024-05-05 19:44:37 +04:00
8 changed files with 122 additions and 14 deletions
--- a/backend/config.py
+++ b/backend/config.py
@ -13,10 +13,6 @@ class Config:
        # Cookies are in Netscape CSV format, see yt-dlp docs
        self.cookies_dir = Path(os.getenv('COOKIES_DIR') or 'cookies')
        # Note: yt-dlp's path trimmer also counts album_path_tmpl, not only filename
        # Why 235? 255 is the ext4 limit. 255 - len("/var/lib/musicdlp/") = 237, rounded down to 235
        self.path_length = int(os.getenv('PATH_LENGTH') or 235)
        self.tmpl = os.path.join(
            # `artists.0` instead of `artist`, because the latter can contain "feat. ..."
            os.getenv('ALBUM_PATH_TMPL') or 'music/%(artists.0)s/%(album)s',
--- a/backend/genius.py
+++ b/backend/genius.py
@ -16,16 +16,16 @@ lyrics_xpath = etree.XPath('//div[@id="lyrics-root"][1]/div[@data-lyrics-contain
 br_xpath = etree.XPath('.//br')
-def search(title: str, artist: str) -> str:
+def search(title: str, artist: str) -> tuple[str, str]:
-    '''Searches for Genius lyrics using SearXNG + Yahoo and returns the first URL.
+    '''Searches for Genius lyrics using SearXNG + Yahoo
-    Irrelevant texts should be picked manually'''
+    and returns the first result as tuple(title, url)'''
    resp = http_pool.get().request(
        'GET',
        'https://searx.dc09.ru/search',
        fields={
            'q': artist + ' ' + title + ' site:genius.com',
-            'engines': 'brave',
+            'engines': 'brave,yahoo',
            'safesearch': '0',
            'format': 'json',
        },
@ -34,7 +34,27 @@ def search(title: str, artist: str) -> str:
    result: dict[str, str] = resp.json()['results'][0]
    del resp
-    return result['url']
+    return (result['title'], result['url'])
 def raise_on_irrelevant_result(res_title: str, track_track: str, track_artist: str) -> None:
    '''Raises ValueError if a search result does not look like the requested track.

    res_title is the title of the search result; it is expected to look like
    "<artist> <en dash> <track>", with the en dash surrounded by single spaces.
    track_track is the title of the track being tagged,
    track_artist is the name of its artist.

    The result is accepted only if BOTH conditions hold:
    - at least one word of track_artist occurs in the artist part of res_title
    - at least one word of track_track occurs in the track part of res_title
    Words are whatever word_regex matches; the comparison is a lowercase
    substring check, so a word also matches inside a longer word.

    ValueError is raised as well if res_title has no " <en dash> " separator.'''
    # partition() instead of split() + unpacking:
    # a missing separator is reported explicitly, not as an unpacking error
    res_artist, separator, res_track = res_title.lower().partition(' \u2013 ')
    if not separator:
        raise ValueError(f'unexpected search result title format: {res_title!r}')

    artist_found = any(
        word.group(0).lower() in res_artist
        for word in word_regex.finditer(track_artist)
    )
    if not artist_found:
        raise ValueError(
            f'no words of artist {track_artist!r} found in search result {res_title!r}'
        )

    track_found = any(
        word.group(0).lower() in res_track
        for word in word_regex.finditer(track_track)
    )
    if not track_found:
        raise ValueError(
            f'no words of track title {track_track!r} found in search result {res_title!r}'
        )
 def parse(url: str) -> str:
--- a/backend/id3pp.py
+++ b/backend/id3pp.py
@ -63,7 +63,8 @@ class ID3TagsPP(PostProcessor):
            file['TCON'] = id3.TCON(encoding=ENC_UTF8, text=information['genre'])
        try:
-            lyr_url = genius.search(title, artists[0])
+            lyr_title, lyr_url = genius.search(title, artists[0])
            genius.raise_on_irrelevant_result(lyr_title, title, artists[0])
            file['USLT'] = id3.USLT(encoding=ENC_UTF8, text=genius.parse(lyr_url))
        except:
            pass
--- a/backend/test_genius.py
+++ b/backend/test_genius.py
@ -23,15 +23,46 @@ LYR3 = '''you are gonna get yours
 Another day'''
 # There are no lyrics for this song on Genius
 # Maybe someday TITLE2 and ARTIST2 will need to be changed
 # (But a real, existing song is chosen intentionally)
 TITLE2 = 'Паруса'
 ARTIST2 = 'PIZZA'
 class TestGenius(TestCase):
    def setUp(self) -> None:
        http_pool.get()
-    def test_search_success(self) -> None:
+    def test_search(self) -> None:
-        url = genius.search(TITLE, ARTIST)
+        _, url = genius.search(TITLE, ARTIST)
        self.assertEqual(url, URL)
    def test_search_success(self) -> None:
        '''The first search result for TITLE / ARTIST must pass the relevancy filter
        (raise_on_irrelevant_result() must not raise)'''
        found_title, _url = genius.search(TITLE, ARTIST)
        genius.raise_on_irrelevant_result(found_title, TITLE, ARTIST)
    def test_search_failure(self) -> None:
        '''The first search result for TITLE2 / ARTIST2 must be rejected
        by the relevancy filter with ValueError'''
        found_title, _url = genius.search(TITLE2, ARTIST2)
        with self.assertRaises(ValueError):
            genius.raise_on_irrelevant_result(found_title, TITLE2, ARTIST2)
    def test_relevancy_success(self) -> None:
        '''A result title sharing a word with each of the two other arguments
        (in any letter case) must be accepted, i.e. nothing is raised'''
        matching_title = 'ABC hEllo world!@ \u2013 sOmE artist123'
        genius.raise_on_irrelevant_result(matching_title, 'Artist123', 'hello World')
    def test_relevancy_failure(self) -> None:
        '''A result title sharing no words with the other two arguments
        on the corresponding side of the dash must be rejected with ValueError'''
        unrelated_title = 'DEF hEllo world@!15 \u2013 anOther artist456'
        with self.assertRaises(ValueError):
            genius.raise_on_irrelevant_result(
                unrelated_title,
                'DEF 789',
                'ABC irrelevant track title',
            )
    def test_lyrics_parsing(self) -> None:
        lyrics = genius.parse(URL)
        self.assertTrue(lyrics.startswith(LYR1))
--- a/backend/ydl_pool.py
+++ b/backend/ydl_pool.py
@ -63,7 +63,6 @@ class Downloader:
        if ydl is None:
            ydl = create_ydl_fn[site]()
            ydl.params['trim_file_name'] = cfg.path_length  # Note: not only filename, but path in outtmpl
            ydl.params['outtmpl']['default'] = cfg.tmpl
            ydl.add_post_processor(id3pp.ID3TagsPP(), when='post_process')
--- a/frontend/index.html
+++ b/frontend/index.html
@ -1 +1,33 @@
-<!-- TODO: simple web ui, websockets -->
+<!DOCTYPE html>
 <html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>musicdlp</title>
    <link rel="stylesheet" href="/style.css">
    <script src="/script.js"></script>
  </head>
  <body>
    <div>
      <input type="text" id="url" placeholder="Playlist or track URL">
      <button type="button" id="guess-site-btn">Guess site</button>
    </div>
    <div>
      <select id="site-select">
        <option value="youtube" selected>YouTube</option>
        <option value="yt_proxied">YT proxied</option>
        <option value="yandex">Yandex Music</option>
      </select>
    </div>
    <div>
      <button type="button" id="items-btn">Get playlist items</button>
    </div>
    <div id="items-container"></div>
    <div>
      <button type="button">Download</button>
    </div>
    <div>
      <label>Progress: <span id="progress">not implemented</span></label>
    </div>
  </body>
 </html>
--- a/frontend/script.js
+++ b/frontend/script.js
@ -0,0 +1,18 @@
 addEventListener('DOMContentLoaded', () => {
  // Controls are looked up only after the document has been parsed
  /** @type {HTMLInputElement} */
  const urlField = document.getElementById('url')
  /** @type {HTMLSelectElement} */
  const siteSelect = document.getElementById('site-select')

  // "Guess site" button: choose the site option from recognizable URL fragments.
  // URLs that match nothing leave the current selection untouched.
  document.getElementById('guess-site-btn').addEventListener('click', () => {
    const url = urlField.value
    const looksLikeYouTube =
      url.includes('/watch?v=') || url.includes('/playlist?list=')

    if (looksLikeYouTube) {
      // Do not override an already selected "yt_proxied"
      // (presumably it accepts the same URLs as "youtube")
      if (siteSelect.value !== 'yt_proxied') {
        siteSelect.value = 'youtube'
      }
    } else if (url.includes('://music.yandex.')) {
      siteSelect.value = 'yandex'
    }
  })
 })
--- a/frontend/style.css
+++ b/frontend/style.css
@ -0,0 +1,11 @@
 body {
  margin: 0;
  padding: 0.5rem;
  display: flex;
  flex-direction: column;
  align-items: center;
  row-gap: 0.25rem;
  font-family: 'Noto Sans', 'Roboto', 'Ubuntu', sans-serif;
 }
Author	SHA1	Message	Date
DarkCat09	474c24e05a	Lyrics: switch to multiple search engines Brave sometimes ratelimits SearXNG server	2024-05-06 20:34:03 +04:00
DarkCat09	0b0759fb3b	Lyrics: add irrelevant results filtering Small note on how it works, copied from raise_on_irrelevant_result() docstring: Raises ValueError if no words from track title are present in search result track title and no words from artist name are present in search result artist name	2024-05-06 20:31:42 +04:00
DarkCat09	62ebecc87f	Remove `path_length` config option I've misunderstood the FS limits, 255 is for filename, not path. Path is limited only by libc, it's 4096. The default outtmpl in musicdlp contains slashes, so it's a path. But YDL considers it to be a filename, so the whole outtmpl formatting result is trimmed to path_length. Do we really need this? I think there are no "malicious" long-named tracks :)	2024-05-06 19:43:32 +04:00
DarkCat09	e1ef74cc1c	Simpliest frontend UI	2024-05-05 19:44:37 +04:00