From ed775781a36bc5b5edb8e8ecc6633c9d41c78dc4 Mon Sep 17 00:00:00 2001
From: DarkCat09 <a.chechkenev@yandex.ru>
Date: Fri, 3 Feb 2023 20:59:13 +0400
Subject: [PATCH] ID3 tagger rewritten in Python, AZLyrics instead of Genius

---
 .gitignore        |   1 +
 .id3tag_helper.py | 403 ++++++++++++++++++++++++++++++++++++++++++++++
 .id3tag_helper.sh |  81 ----------
 id3tag.sh         |   2 +-
 requirements.txt  |   3 +
 5 files changed, 408 insertions(+), 82 deletions(-)
 create mode 100755 .id3tag_helper.py
 delete mode 100755 .id3tag_helper.sh
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index e7b5387..c200e4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ files/
 convert/
 tagged/
 .vscode/
+lyrics.txt
diff --git a/.id3tag_helper.py b/.id3tag_helper.py
new file mode 100755
index 0000000..421197e
--- /dev/null
+++ b/.id3tag_helper.py
@@ -0,0 +1,403 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import shutil
+from pathlib import Path
+
+import logging
+
+import mimetypes
+import subprocess
+
+from typing import TypedDict
+from typing import Optional
+
+import re
+
+import requests
+from bs4 import BeautifulSoup
+
+from mutagen.id3 import ID3
+from mutagen.id3 import TPE1, TIT2, TALB
+from mutagen.id3 import TYER, TRCK
+from mutagen.id3 import USLT, APIC
+
+BASEURL = 'https://www.azlyrics.com'  # prepended to root-relative cover URLs
+USERAGENT = (
+    'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) '
+    'Gecko/20100101 Firefox/110.0'
+)
+# CSS selector prefix shared by the lyrics, artist and title lookups
+LYRICS_ROW = '.main-page>.row>.col-xs-12'
+# One HTTP session for all requests, sending the browser User-Agent above
+session = requests.Session()
+session.headers['User-Agent'] = USERAGENT
+
+
+class ParseResult(TypedDict):  # song metadata collected for tagging
+    title: str  # song title
+    artist: str  # artist name
+    album: str  # album name
+    year: int  # release year; 0 when unknown
+    track_no: int  # 1-based position in the album; 0 when unknown
+    tracks: int  # number of tracks in the album; 0 when unknown
+    lyrics: str  # lyrics text
+    cover: Optional[bytes]  # raw album-cover image data, if any
+    cover_mime: Optional[str]  # MIME type of `cover`, if any
+
+
+class ParseError(Exception):
+    """Raised when an expected element is missing from a fetched page."""
+
+    def __init__(self, parsing_obj: str) -> None:
+        """Build the message from *parsing_obj*, a name for what was sought."""
+        message = f'Unable to parse {parsing_obj}'
+        super().__init__(message)
+
+
+def main() -> None:
+    """Tag the MP3 at ``sys.argv[2]``; copy it first if ``sys.argv[1]`` is 1."""
+    copy = int(sys.argv[1]) == 1
+    file = sys.argv[2]
+
+    title = conv_title(file)
+    print(
+        'Enter new title to correct it, '
+        'or press Enter to continue',
+        '"!--" without quotes means that '
+        'you want to enter info and lyrics manually',
+        sep='\n',
+    )
+    # A failed lookup asks for the title again instead of dying with a traceback
+    while True:
+        print('Title:', title)
+        correct = input().strip()
+        if correct == '!--':
+            parsed = manual_info_input()
+            break
+        title = correct.lower() or title
+        try:
+            url = search_azurl(title)
+            print(url)
+            parsed = parse_azlyrics(url)
+            break
+        except (ParseError, requests.RequestException) as err:
+            print(f'{err}. Try another title, or "!--" for manual input')
+    tagmp3(file, parsed, copy)
+
+
+def input_num(msg: str, def_: int = 0) -> int:
+    """Prompt with *msg*; return the reply as an int, or *def_* if invalid."""
+    try:
+        return int(input(msg))
+    except ValueError:
+        return def_
+
+
+def conv_title(file: str) -> str:
+    """Derive a lowercase search title from a downloaded file's path.
+
+    Any directory part, the extension and a trailing ``---<video id>`` are
+    dropped, parenthesised notes such as "(Official Audio)" are removed
+    one group at a time, and underscores become spaces.
+
+    Example: ``./files/Artist_-_Song_(Official_Audio)---dQw4w9WgXcQ.mp3``
+    becomes ``artist - song``.
+    """
+    # Path.stem handles every directory, not only ./convert/ and ./files/,
+    # and strips the extension even when the name carries no video ID
+    title = Path(file).stem
+
+    # Remove a YT ID
+    title = re.sub(r'-{3}[\w-]*$', '', title)
+
+    # Remove "(Official Audio)"; one group at a time, so that text between
+    # two parenthesised notes survives
+    title = re.sub(r'\([^()]*\)', '', title)
+
+    # underscore -> space, then collapse the gaps left by removed groups
+    title = title.replace('_', ' ')
+    title = ' '.join(title.split())
+
+    return title.lower()
+
+
+def search_azurl(title: str) -> str:
+    """Return the URL of the first AZLyrics lyrics page found for *title*.
+
+    Raises ParseError when the search results contain no such link.
+    """
+    print('Searching...')
+    page = session.get(
+        'https://searx.dc09.ru/search',
+        params={
+            'q': f'{title} site:azlyrics.com',
+            'category_general': 1,
+            'language': 'ru-RU',
+            'time_range': '',
+            'safesearch': 0,
+            'theme': 'simple',
+        },
+        timeout=30,  # without a timeout a stalled server hangs the script
+    )
+    soup = BeautifulSoup(page.text, 'html.parser')
+    link = soup.select_one(
+        'div#urls>article>h3>a[href*="azlyrics.com/lyrics/"]'
+    )
+    if link is None:
+        raise ParseError('song URL')
+    return str(link.get('href'))
+
+
+def parse_azlyrics(link: str) -> ParseResult:
+    """Download the AZLyrics page at *link* and extract the song's metadata.
+
+    Lyrics are mandatory: ParseError is raised when they cannot be found.
+    Artist, title and album details that cannot be parsed are asked from
+    the user on stdin instead. A cover that cannot be read or downloaded
+    is logged or skipped, never fatal.
+    """
+    result = ParseResult(
+        title='', artist='',
+        album='', year=0,
+        track_no=0, tracks=0,
+        lyrics='',
+        cover=None,
+        cover_mime=None,
+    )
+
+    print('Please wait...')
+
+    # Time out rather than hang forever on a stalled connection
+    page = session.get(link, timeout=30)
+    soup = BeautifulSoup(page.text, 'html.parser')
+
+    lyrics = soup.select_one(
+        f'{LYRICS_ROW}>div'
+        ':not(.div-share)'
+        ':not(.lyricsh)'
+        ':not(.ringtone)'
+    )
+    if lyrics is None:
+        raise ParseError('song lyrics')
+    result['lyrics'] = lyrics.get_text().strip()
+
+    artist_elem = soup.select_one(f'{LYRICS_ROW}>.lyricsh>h2')
+    if artist_elem is None:
+        print('Unable to parse artist name')
+        result['artist'] = input('Enter the artist name: ')
+    else:
+        result['artist'] = artist_elem.get_text() \
+            .removesuffix(' Lyrics') \
+            .strip()
+
+    title_elem = soup.select_one(f'{LYRICS_ROW}>b')
+    if title_elem is None:
+        print('Unable to parse song title')
+        result['title'] = input('Enter the title: ')
+    else:
+        result['title'] = title_elem.get_text().strip('" ')
+
+    album_blocks = soup.select('.songinalbum_title')
+    album = None
+    if len(album_blocks) > 1:
+        album = album_blocks[-2]
+    elif len(album_blocks) > 0:
+        album = album_blocks[0]
+
+    album_re = None
+    if album is not None:
+        album_re = re.search(
+            r'album:\s*"(.+?)"\s*\((\d+)\)',
+            album.get_text()
+        )
+
+    if album_re is None:
+        print('Unable to parse album name')
+        result['album'] = input('Enter the album name: ')
+        result['year'] = input_num('Enter the release year: ')
+        result['track_no'] = input_num('This is the track #')
+        result['tracks'] = input_num('Number of tracks in the album: ')
+
+        answer = input('Insert an album cover? [Y/n] ')
+        if answer.lower() not in ('n', 'н'):
+            try:
+                print(
+                    'Download the cover and enter its path:',
+                    '(relative path is not recommended)',
+                    sep='\n',
+                )
+                cover_file = Path(input().strip())
+                result['cover'] = cover_file.read_bytes()
+                result['cover_mime'] = (
+                    mimetypes.guess_type(cover_file)[0]
+                    or 'image/jpeg'
+                )
+            except (OSError, ValueError) as err:
+                # Best effort: log the problem and go on without a cover
+                logging.exception(err)
+
+    else:
+        result['album'] = album_re[1]
+        result['year'] = int(album_re[2])
+
+        assert album is not None  # implied by the successful match above
+        cover = album.select_one('img.album-image')
+        if cover is not None:
+            cover_url = str(cover.get('src'))
+            if cover_url.startswith('/'):
+                cover_url = BASEURL + cover_url
+
+            # Keep the image only when the download succeeded; otherwise
+            # an HTML error page would be embedded as the cover
+            try:
+                req = session.get(cover_url, timeout=30)
+            except requests.RequestException as err:
+                logging.exception(err)
+            else:
+                if req.ok:
+                    result['cover'] = req.content
+                    result['cover_mime'] = req.headers.get(
+                        'Content-Type', 'image/jpeg'
+                    )
+
+        tracklist_elem = soup.select_one('.songlist-panel')
+        if tracklist_elem is not None:
+            tracklist = tracklist_elem.select('.listalbum-item')
+            result['tracks'] = len(tracklist)
+
+            # Track number = 1-based position of the entry whose link
+            # contains this page's "/lyrics/....html" path; 0 if none does
+            current_url = re.search(r'/(lyrics/.+?\.html)', link)
+            result['track_no'] = 0
+            if current_url is not None:
+                for i, track in enumerate(tracklist):
+                    track_url = track.select_one('a')
+                    if track_url is None:
+                        continue
+                    track_href = str(track_url.get('href'))
+                    if current_url[0] in track_href:
+                        result['track_no'] = (i + 1)
+                        break
+
+    return result
+
+
+def manual_info_input() -> ParseResult:
+    """Ask the user for every tag value and return them as a ParseResult.
+
+    Lyrics are typed or pasted into ``lyrics.txt`` with a text editor
+    ($EDITOR, ``nano`` or one named at the prompt); the cover is read from
+    a path the user enters. Failures in either step are logged, leaving
+    the lyrics empty or the cover unset.
+    """
+    result = ParseResult(
+        title=input('Song title: '),
+        artist=input('Artist name: '),
+        album=input('Album name: '),
+        year=input_num('Release year: '),
+        track_no=input_num('Track #'),
+        tracks=input_num('Tracks in album: '),
+        lyrics='', cover=None, cover_mime=None,
+    )
+
+    editor = os.getenv('EDITOR', 'nano')
+    print('Now, paste the lyrics into a text editor')
+    print(f'Default editor: {editor}')
+    print('Enter another or press Enter to continue')
+    editor = input().strip() or editor
+
+    lyrics_file = Path('.') / 'lyrics.txt'
+    try:
+        # Start from a fresh file so that lyrics left over from a previous
+        # song are never picked up by mistake
+        lyrics_file.write_text('\n', encoding='utf-8')
+
+        subprocess.call([
+            editor,
+            lyrics_file,
+        ])
+
+        print('Reading file...')
+        result['lyrics'] = lyrics_file.read_text(encoding='utf-8').strip()
+        print('Done')
+
+    except (OSError, UnicodeError) as err:
+        logging.exception(err)
+
+    answer = input('Insert an album cover? [Y/n] ')
+    if answer.lower() not in ('n', 'н'):
+        try:
+            print(
+                'Download the cover and enter its path:',
+                '(relative path is not recommended)',
+                sep='\n',
+            )
+            cover_file = Path(input().strip())
+            result['cover'] = cover_file.read_bytes()
+            result['cover_mime'] = (
+                mimetypes.guess_type(cover_file)[0]
+                or 'image/jpeg'
+            )
+        except (OSError, ValueError) as err:
+            logging.exception(err)
+
+    return result
+
+
+def tagmp3(
+        file: str,
+        parsed: ParseResult,
+        copy: bool) -> None:
+    """Write the values in *parsed* to the MP3 as ID3 tags.
+
+    With *copy* set, the file (and cover) is first copied under
+    ``./tagged/<artist>/<album>/`` and the copy is tagged instead.
+    """
+    from mutagen.id3 import ID3NoHeaderError
+
+    def safe(name: str) -> str:
+        # A "/" in a tag value would otherwise be taken as a subdirectory
+        return name.replace('/', '_')
+
+    newpath = Path(file)
+    if copy:
+        newdir = (
+            Path('./tagged') /
+            safe(parsed['artist']) /
+            safe(parsed['album'])
+        )
+        newdir.mkdir(parents=True, exist_ok=True)
+
+        newpath = newdir / safe(
+            f"{parsed['track_no']}. {parsed['title']}.mp3"
+        )
+        shutil.copy(file, newpath)
+
+        if parsed['cover'] is not None:
+            ext = mimetypes.guess_extension(
+                parsed['cover_mime'] or ''
+            ) or '.jpg'
+            (newdir / f'cover{ext}').write_bytes(parsed['cover'])
+
+    try:
+        id3 = ID3(str(newpath))
+    except ID3NoHeaderError:
+        # The file has no ID3 tag yet: start an empty one
+        id3 = ID3()
+    id3['TPE1'] = TPE1(text=parsed['artist'])
+    id3['TIT2'] = TIT2(text=parsed['title'])
+    id3['TALB'] = TALB(text=parsed['album'])
+    id3['TYER'] = TYER(text=f"{parsed['year']}")
+    id3['TRCK'] = TRCK(
+        text=f"{parsed['track_no']}/{parsed['tracks']}"
+    )
+    id3['USLT'] = USLT(text=parsed['lyrics'])
+    if parsed['cover'] is not None:
+        id3['APIC'] = APIC(data=parsed['cover'], mime=parsed['cover_mime'])
+    id3.save(str(newpath))
+
+
+if __name__ == '__main__':
+    main()  # expects: <copy flag> <mp3 path>, as passed by id3tag.sh
diff --git a/.id3tag_helper.sh b/.id3tag_helper.sh
deleted file mode 100755
index 0c8e2a0..0000000
--- a/.id3tag_helper.sh
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env bash
-# Should not be called manually
-
-conv_title () {
-    echo "$1" | \
-    # remove directory name
-    sed -E 's#\./(convert|files)/##' | \
-    # remove video ID and ext
-    sed 's/---[A-Za-z0-9_-]*\.[A-Za-z0-9]*//' | \
-    # underscore -> space
-    sed 's/_/ /g' | \
-    # remove "(Official Audio)"
-    sed -E 's/\(.*\)//' | \
-    # trim spaces
-    xargs
-}
-
-rm_quotes () {
-    cat | sed 's/^"//' | sed 's/"$//'
-}
-
-title=$(conv_title "$2")
-title_prev="$title"
-
-echo 'Correct the song title if needed:'
-read -e -r -i "$title" title
-title=${title:-$title_prev}
-
-echo 'Searching on Genius'
-
-link="https://genius.com/api/search/multi?q=${title// /%20}"
-echo "URL: $link"
-
-song=$(curl -sL "$link" | jq '.response.sections[1].hits[0].result')
-title=$(echo "$song" | jq '.title' | rm_quotes)
-artist=$(echo "$song" | jq '.primary_artist.name' | rm_quotes)
-year=$(echo "$song" | jq '.release_date_components.year')
-page_url=$(echo "$song" | jq '.url' | rm_quotes)
-
-echo "Title: $title"
-echo "Artist: $artist"
-echo "Lyrics: $page_url"
-
-echo 'Parsing lyrics page'
-
-page=$(curl -sL "$page_url")
-album=$(echo "$page" | pup -p 'a[class^="PrimaryAlbum__Title"] text{}' | sed -E 's#\([0-9]+\)$##' | xargs)
-tracknum=$(echo "$page" | pup -p 'div[class^="HeaderTracklist__AlbumWrapper"]' | grep -oE 'Track [0-9]+' | grep -oE '[0-9]+')
-trackall=$(echo "$page" | pup -p 'ol[class^="AlbumTracklist__Container"] > li:last-child div[class^="AlbumTracklist__TrackNumber"] text{}' | grep -oE '[0-9]+')
-lyrics=$(echo "$page" | pup -p 'div[data-lyrics-container="true"] text{}' | sed 's#^\[#\n[#')
-
-# remove first blank line
-if [[ $(echo "$lyrics" | sed -n '1p') == "" ]]; then
-    lyrics=$(echo "$lyrics" | sed '1d')
-fi
-
-echo "Album: $album"
-
-if [[ $1 == 1 ]]; then
-    newdir="./tagged/${artist}/${album}"
-    mkdir -p "$newdir"
-    newpath="${newdir}/${tracknum}. ${title}.mp3"
-    echo "Copying to $newpath"
-    cp -f "$2" "$newpath"
-else
-    newpath="$2"
-fi
-
-echo
-echo "$lyrics"
-echo
-
-echo 'Adding ID3v2 tags'
-mid3v2 \
---artist "$artist" \
---album "$album" \
---song "$title" \
---year "$year" \
---track "${tracknum}/${trackall}" \
---USLT "$lyrics" \
-"$newpath"
diff --git a/id3tag.sh b/id3tag.sh
index 376c59f..4d680a7 100755
--- a/id3tag.sh
+++ b/id3tag.sh
@@ -19,4 +19,4 @@ copy_arg=$(ask)
 echo
 
 find "$directory" -type f -name "*.mp3" -exec \
-bash ./.id3tag_helper.sh "$copy_arg" {} \;
+python3 ./.id3tag_helper.py "$copy_arg" {} \;
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..05052dc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+mutagen
+requests
+bs4