ID3 tagger rewritten in Python, AZLyrics instead of Genius

2023-02-03 20:59:13 +04:00 · 2023-02-03 20:59:13 +04:00 · ed775781a3
commit ed775781a3
parent 902b6414a5
5 changed files with 408 additions and 82 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,4 @@ files/
 convert/
 tagged/
 .vscode/
 lyrics.txt
--- a/.id3tag_helper.py
+++ b/.id3tag_helper.py
@ -0,0 +1,403 @@
 #!/usr/bin/env python3
 import os
 import sys
 import shutil
 from pathlib import Path
 import logging
 import mimetypes
 import subprocess
 from typing import TypedDict
 from typing import Optional
 import re
 import requests
 from bs4 import BeautifulSoup
 from mutagen.id3 import ID3
 from mutagen.id3 import TPE1, TIT2, TALB
 from mutagen.id3 import TYER, TRCK
 from mutagen.id3 import USLT, APIC
 # Root of the lyrics site; prepended to site-relative cover image URLs.
 BASEURL = 'https://www.azlyrics.com'
 # Browser-like User-Agent sent with every request made through `session`.
 USERAGENT = (
    'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) '
    'Gecko/20100101 Firefox/110.0'
 )
 # CSS selector prefix of the page column in which the lyrics, the artist
 # heading and the song title are looked up.
 LYRICS_ROW = '.main-page>.row>.col-xs-12'
 # One shared HTTP session so that all requests carry the same headers.
 session = requests.Session()
 session.headers.update({'User-Agent': USERAGENT})
 class ParseResult(TypedDict):
    """Song metadata gathered from a lyrics page or typed in by the user."""
    # Song title.
    title: str
    # Performing artist.
    artist: str
    # Album name.
    album: str
    # Release year; producers in this file use 0 when it is unknown.
    year: int
    # Position of the song within the album; 0 when it was not determined.
    track_no: int
    # Total number of tracks in the album.
    tracks: int
    # Full lyrics text.
    lyrics: str
    # Raw bytes of the album cover image, or None when there is no cover.
    cover: Optional[bytes]
    # MIME type of `cover`, or None when there is no cover.
    cover_mime: Optional[str]
 class ParseError(Exception):
    """Raised when a required piece of data cannot be extracted.

    `parsing_obj` names what was being parsed; it is embedded in the
    exception message.
    """

    def __init__(self, parsing_obj: str) -> None:
        message = f'Unable to parse {parsing_obj}'
        super().__init__(message)
 def main() -> None:
    """Tag one MP3 file interactively.

    Command line: ``argv[1]`` is the copy flag (the integer 1 enables
    copying, see `tagmp3`), ``argv[2]`` is the path of the MP3 file.
    The user may accept the guessed title, type a corrected one, or enter
    ``!--`` to supply all metadata and lyrics by hand.
    """
    copy = int(sys.argv[1]) == 1
    file = sys.argv[2]
    title = conv_title(file)

    hint_lines = (
        'Enter new title to correct it, '
        'or press Enter to continue',
        '"!--" without quotes means that '
        'you want to enter info and lyrics manually',
    )
    print(*hint_lines, sep='\n')
    print('Title:', title)

    answer = input().strip()
    if answer == '!--':
        # Manual mode: no search, no scraping.
        tagmp3(file, manual_info_input(), copy)
        return

    if answer:
        title = answer.lower()
    url = search_azurl(title)
    print(url)
    tagmp3(file, parse_azlyrics(url), copy)
 def input_num(msg: str, def_: int = 0) -> int:
    """Prompt with `msg` and return the reply as an integer.

    Returns `def_` when the reply is not a valid integer (including an
    empty reply).
    """
    reply = input(msg)
    try:
        number = int(reply)
    except ValueError:
        number = def_
    return number
 def conv_title(file: str) -> str:
    """Derive a lowercase search title from a downloaded file's path.

    The ``./convert/`` and ``./files/`` directory prefixes, the
    ``---<video id>.<ext>`` suffix and every parenthesised note such as
    "(Official Audio)" are removed; underscores become spaces and runs of
    whitespace are collapsed to a single space.
    """
    # Remove file path
    title = file \
        .replace('./convert/', '') \
        .replace('./files/', '')
    # Remove a YT ID and an extension
    title = re.sub(
        r'-{3}[\w_-]*\.[\w_-]*',
        '', title,
    )
    # Remove "(Official Audio)" and similar notes. Each parenthesised group
    # is removed on its own (innermost first, repeated for nesting) so that
    # text standing between two groups is kept; a greedy `\(.*\)` would
    # delete everything from the first "(" to the last ")".
    previous = None
    while previous != title:
        previous = title
        title = re.sub(r'\([^()]*\)', '', title)
    # underscore -> space
    title = title.replace('_', ' ')
    # Trim and collapse the gaps left behind by the removals above.
    return ' '.join(title.split()).lower()
 def search_azurl(title: str) -> str:
    """Find the AZLyrics page for `title` through a SearX instance.

    Returns the ``href`` of the first search result that points to an
    ``azlyrics.com/lyrics/`` page.

    Raises:
        ParseError: no such result is present in the search page.
    """
    print('Searching...')
    query = {
        'q': f'{title} site:azlyrics.com',
        'category_general': 1,
        'language': 'ru-RU',
        'time_range': '',
        'safesearch': 0,
        'theme': 'simple',
    }
    response = session.get('https://searx.dc09.ru/search', params=query)
    document = BeautifulSoup(response.text, 'html.parser')
    anchor = document.select_one(
        'div#urls>article>h3>a[href*="azlyrics.com/lyrics/"]'
    )
    if anchor is not None:
        return str(anchor.get('href'))
    raise ParseError('song URL')
 def _prompt_album_details(result: ParseResult) -> None:
    """Ask the user for the album fields that could not be scraped."""
    result['album'] = input('Enter the album name: ')
    result['year'] = input_num('Enter the release year: ')
    result['track_no'] = input_num('This is the track #')
    result['tracks'] = input_num('Number of tracks in the album: ')
    cover = input('Insert an album cover? [Y/n] ')
    # Latin "n" and Cyrillic "н" both mean "no"; anything else means "yes".
    if cover.lower() in ('n', 'н'):
        return
    try:
        print(
            'Download the cover and enter its path:',
            '(relative path is not recommended)',
            sep='\n',
        )
        cover_file = Path(input().strip())
        with cover_file.open('rb') as f:
            result['cover'] = f.read()
        result['cover_mime'] = (
            mimetypes.guess_type(cover_file)[0]
            or 'image/jpeg'
        )
    except Exception as err:
        # Best effort: an unreadable cover must not abort the tagging.
        logging.exception(err)
 def _attach_remote_cover(result: ParseResult, cover_url: str) -> None:
    """Download the cover at `cover_url` into `result` (best effort).

    On a network error or a non-success HTTP status the problem is logged
    and `result` is left without a cover, so an error page is never stored
    as image data.
    """
    if cover_url.startswith('/'):
        cover_url = BASEURL + cover_url
    try:
        req = session.get(cover_url)
        req.raise_for_status()
    except requests.RequestException as err:
        logging.exception(err)
        return
    result['cover'] = req.content
    # Keep only the media type, dropping parameters such as "; charset=...".
    content_type = req.headers.get('Content-Type', 'image/jpeg')
    result['cover_mime'] = content_type.split(';')[0].strip() or 'image/jpeg'
 def _track_number(tracklist, link: str) -> int:
    """Return the 1-based position of the song at `link` in `tracklist`.

    Returns 0 when `link` has no ``/lyrics/....html`` part or no track
    entry links to it.
    """
    current_url = re.search(
        r'/(lyrics/.+?\.html)',
        link,
    )
    if current_url is None:
        return 0
    for position, track in enumerate(tracklist, start=1):
        track_url = track.select_one('a')
        if track_url is None:
            continue
        if current_url[0] in str(track_url.get('href')):
            return position
    return 0
 def parse_azlyrics(link: str) -> ParseResult:
    """Scrape song metadata and lyrics from the AZLyrics page at `link`.

    Artist, title and album details that cannot be found in the page are
    requested from the user interactively. The album cover is downloaded
    on a best-effort basis.

    Raises:
        ParseError: the lyrics block is not present in the page.
    """
    result = ParseResult(
        title='', artist='',
        album='', year=0,
        track_no=0, tracks=0,
        lyrics='',
        cover=None,
        cover_mime=None,
    )
    print('Please wait...')
    page = session.get(link)
    soup = BeautifulSoup(page.text, 'html.parser')
    # The lyrics live in the only <div> of the column that is not one of
    # the share / header / ringtone blocks.
    lyrics = soup.select_one(
        f'{LYRICS_ROW}>div'
        ':not(.div-share)'
        ':not(.lyricsh)'
        ':not(.ringtone)'
    )
    if lyrics is None:
        raise ParseError('song lyrics')
    result['lyrics'] = lyrics.get_text().strip()
    artist_elem = soup.select_one(f'{LYRICS_ROW}>.lyricsh>h2')
    if artist_elem is None:
        print('Unable to parse artist name')
        result['artist'] = input('Enter the artist name: ')
    else:
        result['artist'] = artist_elem.get_text() \
            .removesuffix(' Lyrics') \
            .strip()
    title_elem = soup.select_one(f'{LYRICS_ROW}>b')
    if title_elem is None:
        print('Unable to parse song title')
        result['title'] = input('Enter the title: ')
    else:
        result['title'] = title_elem.get_text().strip('" ')
    album_blocks = soup.select('.songinalbum_title')
    # NOTE(review): with several album blocks the second-to-last one is
    # used; presumably the last block is not the wanted album -- confirm
    # against the page markup.
    album = None
    if len(album_blocks) > 1:
        album = album_blocks[-2]
    elif album_blocks:
        album = album_blocks[0]
    album_re = None
    if album is not None:
        # Expected text shape: album: "Name" (1999)
        album_re = re.search(
            r'album:\s*"(.+?)"\s*\((\d+)\)',
            album.get_text()
        )
    if album is None or album_re is None:
        print('Unable to parse album name')
        _prompt_album_details(result)
        return result
    result['album'] = album_re[1]
    result['year'] = int(album_re[2])
    cover = album.select_one('img.album-image')
    if cover is not None:
        src = cover.get('src')
        # An <img> without src would otherwise turn into the URL "None".
        if src:
            _attach_remote_cover(result, str(src))
    tracklist_elem = soup.select_one('.songlist-panel')
    if tracklist_elem is not None:
        tracklist = tracklist_elem.select(
            '.listalbum-item'
        )
        result['tracks'] = len(tracklist)
        result['track_no'] = _track_number(tracklist, link)
    return result
 def manual_info_input() -> ParseResult:
    """Collect all song metadata, lyrics and cover from the user.

    The text fields are prompted for one by one. The lyrics are typed or
    pasted into ``lyrics.txt`` (in the current directory) using a text
    editor: ``$EDITOR`` (default ``nano``) unless the user names another.
    Failures to run the editor or to read a cover file are logged and
    leave the corresponding field at its empty default.
    """
    # Local import: shlex is needed only here.
    import shlex

    result = ParseResult(
        title=input('Song title: '),
        artist=input('Artist name: '),
        album=input('Album name: '),
        year=input_num('Release year: '),
        track_no=input_num('Track #'),
        tracks=input_num('Tracks in album: '),
        lyrics='', cover=None, cover_mime=None,
    )
    editor = os.getenv('EDITOR', 'nano')
    print('Now, paste the lyrics into a text editor')
    print(f'Default editor: {editor}')
    print('Enter another or press Enter to continue')
    other_editor = input().strip()
    if other_editor != '':
        editor = other_editor
    # An editor setting often carries arguments ("code --wait"). If the
    # whole string is not itself an executable, split it shell-style so
    # that the program and its arguments become separate argv entries.
    command = [editor]
    if shutil.which(editor) is None:
        try:
            command = shlex.split(editor) or [editor]
        except ValueError:
            # Unbalanced quotes: fall back to the string as given.
            command = [editor]
    try:
        lyrics_file = Path('.') / 'lyrics.txt'
        # Start from an (almost) empty file; same encoding for write/read.
        with lyrics_file.open('wt', encoding='utf-8') as f:
            f.write('\n')
        subprocess.call([
            *command,
            lyrics_file,
        ])
        print('Reading file...')
        with lyrics_file.open('rt', encoding='utf-8') as f:
            result['lyrics'] = f.read().strip()
        print('Done')
    except OSError as err:
        logging.exception(err)
    cover = input('Insert an album cover? [Y/n] ')
    # Latin "n" and Cyrillic "н" both mean "no"; anything else means "yes".
    if cover.lower() not in ('n','н'):
        try:
            print(
                'Download the cover and enter its path:',
                '(relative path is not recommended)',
                sep='\n',
            )
            cover_file = Path(input().strip())
            with cover_file.open('rb') as f:
                result['cover'] = f.read()
            result['cover_mime'] = (
                mimetypes.guess_type(cover_file)[0]
                or 'image/jpeg'
            )
        except Exception as err:
            # Best effort: an unreadable cover must not abort the tagging.
            logging.exception(err)
    return result
 def _safe_component(name: str) -> str:
    """Make `name` usable as a single path component.

    Path separators are replaced with ``_`` (so "AC/DC" does not create a
    nested directory and a title containing "/" does not break the copy)
    and ``..`` is neutralised so the result cannot leave its parent.
    """
    for separator in (os.sep, os.altsep):
        if separator:
            name = name.replace(separator, '_')
    return '_' if name == '..' else name
 def tagmp3(
        file: str,
        parsed: ParseResult,
        copy: bool) -> None:
    """Write ID3 tags from `parsed` into an MP3 file.

    With `copy` set, `file` is first copied to
    ``./tagged/<artist>/<album>/<track_no>. <title>.mp3`` (and the cover,
    if any, is saved next to it as ``cover.<ext>``); the copy is tagged and
    the original is left untouched. Otherwise `file` is tagged in place.
    A file that has no ID3 tag yet gets a new one.
    """
    # Local import: the exception is needed only here.
    from mutagen.id3 import ID3NoHeaderError

    oldpath = Path(file)
    newpath = oldpath
    if copy:
        newdir = (
            Path('./tagged') /
            _safe_component(parsed['artist']) /
            _safe_component(parsed['album'])
        )
        os.makedirs(newdir, exist_ok=True)
        newpath = newdir / _safe_component(
            f"{parsed['track_no']}. " +
            f"{parsed['title']}.mp3"
        )
        shutil.copy(oldpath, newpath)
        if parsed['cover'] is not None:
            ext = mimetypes.guess_extension(
                parsed['cover_mime'] or ''
            ) or '.jpg'
            cover = newdir / f'cover{ext}'
            with cover.open('wb') as f:
                f.write(parsed['cover'])
    try:
        id3 = ID3(str(newpath))
    except ID3NoHeaderError:
        # The file carries no ID3 tag yet: start from an empty one.
        id3 = ID3()
    id3['TPE1'] = TPE1(text=parsed['artist'])
    id3['TIT2'] = TIT2(text=parsed['title'])
    id3['TALB'] = TALB(text=parsed['album'])
    id3['TYER'] = TYER(text=f"{parsed['year']}")
    id3['TRCK'] = TRCK(
        text=(
            f"{parsed['track_no']}/"
            f"{parsed['tracks']}"
        )
    )
    id3['USLT'] = USLT(text=parsed['lyrics'])
    if parsed['cover'] is not None:
        id3['APIC'] = APIC(
            data=parsed['cover'],
            # Same fallback type as used elsewhere in this file.
            mime=parsed['cover_mime'] or 'image/jpeg',
        )
    # The path is passed explicitly because a freshly created tag is not
    # bound to any file.
    id3.save(str(newpath))
 # Run the tagger only when this file is executed as a script.
 if __name__ == '__main__':
    main()
--- a/.id3tag_helper.sh
+++ b/.id3tag_helper.sh
@ -1,81 +0,0 @@
 #!/usr/bin/env bash
 # Should not be called manually
 # Turn the path of a downloaded file ($1) into a song title.
 conv_title () {
    # sed steps, in order: remove directory name; remove video ID and ext;
    # underscore -> space; remove "(Official Audio)".
    # xargs then trims spaces.
    echo "$1" \
        | sed -E \
            -e 's#\./(convert|files)/##' \
            -e 's/---[A-Za-z0-9_-]*\.[A-Za-z0-9]*//' \
            -e 's/_/ /g' \
            -e 's/\(.*\)//' \
        | xargs
 }
 # Filter: strip one leading and one trailing double quote from each
 # line read on standard input.
 rm_quotes () {
    sed -e 's/^"//' -e 's/"$//'
 }
 # Arguments: $1 is the copy flag (1 = copy into ./tagged), $2 is the MP3 path.
 # Guess the title from the file name and let the user edit it in place.
 title=$(conv_title "$2")
 title_prev="$title"
 echo 'Correct the song title if needed:'
 read -e -r -i "$title" title
 # Fall back to the guessed title if the user cleared the line.
 title=${title:-$title_prev}
 # Query the Genius search API; spaces in the title are encoded as %20.
 echo 'Searching on Genius'
 link="https://genius.com/api/search/multi?q=${title// /%20}"
 echo "URL: $link"
 # Take the first hit of the second result section as the song.
 song=$(curl -sL "$link" | jq '.response.sections[1].hits[0].result')
 title=$(echo "$song" | jq '.title' | rm_quotes)
 artist=$(echo "$song" | jq '.primary_artist.name' | rm_quotes)
 year=$(echo "$song" | jq '.release_date_components.year')
 page_url=$(echo "$song" | jq '.url' | rm_quotes)
 echo "Title: $title"
 echo "Artist: $artist"
 echo "Lyrics: $page_url"
 # Download the song page and extract album, track numbers and lyrics
 # from its HTML with pup.
 echo 'Parsing lyrics page'
 page=$(curl -sL "$page_url")
 # Album title with a trailing "(<digits>)" removed.
 album=$(echo "$page" | pup -p 'a[class^="PrimaryAlbum__Title"] text{}' | sed -E 's#\([0-9]+\)$##' | xargs)
 # Number taken from the "Track N" text of the header tracklist block.
 tracknum=$(echo "$page" | pup -p 'div[class^="HeaderTracklist__AlbumWrapper"]' | grep -oE 'Track [0-9]+' | grep -oE '[0-9]+')
 # Number of the last entry in the album tracklist.
 trackall=$(echo "$page" | pup -p 'ol[class^="AlbumTracklist__Container"] > li:last-child div[class^="AlbumTracklist__TrackNumber"] text{}' | grep -oE '[0-9]+')
 # Lyrics text; a blank line is inserted before each line starting with "[".
 lyrics=$(echo "$page" | pup -p 'div[data-lyrics-container="true"] text{}' | sed 's#^\[#\n[#')
 # remove first blank line
 if [[ $(echo "$lyrics" | sed -n '1p') == "" ]]; then
    lyrics=$(echo "$lyrics" | sed '1d')
 fi
 echo "Album: $album"
 # Copy mode: tag a copy under ./tagged/<artist>/<album>/; otherwise tag
 # the given file in place.
 if [[ $1 == 1 ]]; then
    newdir="./tagged/${artist}/${album}"
    mkdir -p "$newdir"
    newpath="${newdir}/${tracknum}. ${title}.mp3"
    echo "Copying to $newpath"
    cp -f "$2" "$newpath"
 else
    newpath="$2"
 fi
 echo
 echo "$lyrics"
 echo
 # Write the collected metadata into the file with mid3v2.
 echo 'Adding ID3v2 tags'
 mid3v2 \
 --artist "$artist" \
 --album "$album" \
 --song "$title" \
 --year "$year" \
 --track "${tracknum}/${trackall}" \
 --USLT "$lyrics" \
 "$newpath"
--- a/id3tag.sh
+++ b/id3tag.sh
@ -19,4 +19,4 @@ copy_arg=$(ask)
 echo
 find "$directory" -type f -name "*.mp3" -exec \
-bash ./.id3tag_helper.sh "$copy_arg" {} \;
+python3 ./.id3tag_helper.py "$copy_arg" {} \;
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
 mutagen
 requests
 bs4