feat: extract json from body instead of scraping by elements

This commit is contained in:
rramiachraf 2023-01-28 16:18:31 +01:00
parent c03bd495f6
commit 5898340920
5 changed files with 98 additions and 92 deletions

9
go.mod
View file

@ -3,14 +3,11 @@ module github.com/rramiachraf/dumb
go 1.18
require (
github.com/PuerkitoBio/goquery v1.8.0
github.com/allegro/bigcache/v3 v3.0.2
github.com/gorilla/mux v1.8.0
github.com/russross/blackfriday/v2 v2.1.0
github.com/sirupsen/logrus v1.9.0
github.com/valyala/fastjson v1.6.4
)
require (
github.com/andybalholm/cascadia v1.3.1 // indirect
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 // indirect
)
require golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 // indirect

15
go.sum
View file

@ -1,9 +1,5 @@
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
github.com/allegro/bigcache/v3 v3.0.2 h1:AKZCw+5eAaVyNTBmI2fgyPVJhHkdWder3O9IrprcQfI=
github.com/allegro/bigcache/v3 v3.0.2/go.mod h1:aPyh7jEvrog9zAwx5N7+JUQX5dZTSGpxF1LAR4dr35I=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -11,20 +7,17 @@ github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0=
github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ=
github.com/valyala/fastjson v1.6.4/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 h1:0A+M6Uqn+Eje4kHMK80dtF3JCXC4ykBgQG4Fe06QRhQ=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

136
lyrics.go
View file

@ -1,74 +1,92 @@
package main
import (
"bytes"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/gorilla/mux"
"github.com/russross/blackfriday/v2"
"github.com/valyala/fastjson"
)
// song holds the data handed to the "lyrics" template for one song page.
//
// NOTE(review): this listing looks like a rendered diff hunk, so the first six
// fields presumably appear once as removed lines and once as added lines;
// PrimaryColor is the only field listed a single time — confirm against the
// actual file before relying on the field list.
type song struct {
Artist string
Title string
Image string
Lyrics string
Credits map[string]string
About [2]string
Artist string
Title string
// Image is assigned as "/images" followed by the path of the artwork URL.
Image string
// Lyrics is filled with HTML markup taken from the source page.
Lyrics string
// Credits maps a credit label to the text shown for it.
Credits map[string]string
// About is read by the template as index 0 (full text) and index 1 (summary).
About [2]string
PrimaryColor string
}
// parseLyrics collects the inner HTML of every element in doc that carries
// data-lyrics-container='true' and appends it to s.Lyrics, in the order the
// elements are visited. A container whose HTML cannot be produced is reported
// through logger; whatever Html returned for it is still appended.
func (s *song) parseLyrics(doc *goquery.Document) {
	var lyrics strings.Builder
	lyrics.WriteString(s.Lyrics)

	containers := doc.Find("[data-lyrics-container='true']")
	containers.Each(func(_ int, container *goquery.Selection) {
		fragment, renderErr := container.Html()
		if renderErr != nil {
			logger.Errorln("unable to parse lyrics", renderErr)
		}
		lyrics.WriteString(fragment)
	})

	s.Lyrics = lyrics.String()
}
func (s *song) parseMetadata(doc *goquery.Document) {
artist := doc.Find("a[class*='Artist']").First().Text()
title := doc.Find("h1[class*='Title']").First().Text()
image, exists := doc.Find("meta[property='og:image']").Attr("content")
if exists {
if u, err := url.Parse(image); err == nil {
s.Image = fmt.Sprintf("/images%s", u.Path)
}
func fixJSON(in []byte) []byte {
var out = in
replaceList := map[string]string{
`{\"`: `{"`,
`\":`: `":`,
`:\"`: `:"`,
`\",`: `",`,
`,\"`: `,"`,
`\"}`: `"}`,
`[\"`: `["`,
`\"],`: `"],`,
`\"]}`: `"]}`,
`\\n`: ``,
`\'`: `'`,
`\\"`: `"`,
}
s.Title = title
s.Artist = artist
}
// parseCredits rebuilds s.Credits from every element in doc whose class
// attribute contains "SongInfo__Credit": the text of the element's first child
// becomes the key and the text of its last child becomes the value. Any
// previous Credits map is replaced, and a repeated key keeps the value seen
// last.
func (s *song) parseCredits(doc *goquery.Document) {
	found := map[string]string{}

	rows := doc.Find("[class*='SongInfo__Credit']")
	rows.Each(func(_ int, row *goquery.Selection) {
		label := row.Children().First().Text()
		people := row.Children().Last().Text()
		found[label] = people
	})

	s.Credits = found
}
func (s *song) parseAbout(doc *goquery.Document) {
s.About[0] = doc.Find("[class*='SongDescription__Content']").Text()
summary := strings.Split(s.About[0], "")
if len(summary) > 250 {
s.About[1] = strings.Join(summary[0:250], "") + "..."
for match, replacer := range replaceList {
out = bytes.ReplaceAll(out, []byte(match), []byte(replacer))
}
return out
}
func (s *song) parse(doc *goquery.Document) {
s.parseLyrics(doc)
s.parseMetadata(doc)
s.parseCredits(doc)
s.parseAbout(doc)
// parse fills s from the page's preloaded-state JSON.
//
// urlPath selects the song: it is compared, ignoring case, with the "path" of
// every entry under entities.songs, and the matching entry supplies the title,
// artist, description, artwork and credits. preload is the raw JSON text; it
// is passed through fixJSON before being parsed. The lyrics HTML is read from
// songPage.lyricsData.body.html regardless of urlPath.
//
// Credits is always reset to an empty, non-nil map. If the JSON cannot be
// parsed the error is logged and every other field is left as it was.
//
// NOTE(review): fixJSON applies its replacements while ranging over a map, so
// the order they run in is not fixed; overlapping patterns can therefore give
// different JSON for the same input from one call to the next.
func (s *song) parse(urlPath string, preload []byte) {
	s.Credits = make(map[string]string)

	var parser fastjson.Parser
	root, err := parser.Parse(string(fixJSON(preload)))
	if err != nil {
		// No newline in the format: assumes logger terminates each entry
		// itself — TODO confirm against the logger's configuration.
		logger.Errorf("%s: %s", urlPath, err)
		return
	}

	s.Lyrics = string(root.GetStringBytes("songPage", "lyricsData", "body", "html"))

	root.GetObject("entities", "songs").Visit(func(_ []byte, entry *fastjson.Value) {
		// Several songs can be listed; only the one served at urlPath matters.
		if !strings.EqualFold(string(entry.GetStringBytes("path")), urlPath) {
			return
		}

		s.Title = string(entry.GetStringBytes("title"))
		s.Artist = string(entry.GetStringBytes("artistNames"))
		s.About[0] = string(blackfriday.Run(entry.GetStringBytes("description", "markdown")))
		s.About[1] = string(entry.GetStringBytes("descriptionPreview"))
		s.PrimaryColor = string(entry.GetStringBytes("songArtPrimaryColor"))

		// Keep only the path of the artwork URL and serve it under /images.
		// An unparsable URL leaves Image unchanged.
		if art, artErr := url.Parse(string(entry.GetStringBytes("songArtImageUrl"))); artErr == nil {
			s.Image = fmt.Sprintf("/images%s", art.Path)
		}

		for _, performance := range entry.GetArray("customPerformances") {
			artists := performance.GetArray("artists")
			names := make([]string, 0, len(artists))
			for _, artist := range artists {
				names = append(names, string(artist.GetStringBytes("name")))
			}
			label := string(performance.GetStringBytes("label"))
			s.Credits[label] = strings.Join(names, ", ")
		}
	})
}
func lyricsHandler(w http.ResponseWriter, r *http.Request) {
@ -100,19 +118,17 @@ func lyricsHandler(w http.ResponseWriter, r *http.Request) {
return
}
doc, err := goquery.NewDocumentFromReader(resp.Body)
bodyHTML, err := io.ReadAll(resp.Body)
rgx, err := regexp.Compile(`window\.__PRELOADED_STATE__ = JSON.parse\('(.*)'\);`)
if err != nil {
logger.Errorln(err)
w.WriteHeader(http.StatusInternalServerError)
render("error", w, map[string]string{
"Status": "500",
"Error": "something went wrong",
})
return
}
preload := rgx.FindSubmatch(bodyHTML)[1]
var s song
s.parse(doc)
s.parse(r.URL.RequestURI(), preload)
render("lyrics", w, s)
setCache(id, s)

View file

@ -6,7 +6,7 @@ function showAbout() {
fullAbout.classList.toggle("hidden")
}
[fullAbout, summary].forEach(item => item.onclick = showAbout)
//[fullAbout, summary].forEach(item => item.onclick = showAbout)
document.querySelectorAll("#lyrics a").forEach(item => {
item.addEventListener("click", getAnnotation)

View file

@ -1,38 +1,38 @@
<!DOCTYPE html>
<html>
<head>
<title>{{.Artist}} - {{.Title}} lyrics</title>
<title>{{ printf "%s - %s" .Artist .Title }} Lyrics</title>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link rel="stylesheet" type="text/css" href="/static/style.css" />
<script type="text/javascript" src="/static/script.js" defer></script>
</head>
<body>
{{template "navbar"}}
{{ template "navbar" }}
<div id="container">
<div id="metadata">
<img src="{{.Image}}"/>
<h2>{{.Artist}}</h2>
<h1>{{.Title}}</h1>
<h2>{{ .Artist }}</h2>
<h1>{{ .Title }}</h1>
</div>
<div id="lyrics">{{.Lyrics}}</div>
<div id="lyrics">{{ .Lyrics }}</div>
<div id="info">
<div id="about">
<!--div id="about">
<h1 id="title">About</h1>
<p class="hidden" id="full_about">{{index .About 0}}</p>
<p id="summary">{{index .About 1}}</p>
</div>
<div class="hidden" id="full_about">{{ index .About 0 }}</div>
<p id="summary">{{ index .About 1 }}</p>
</div-->
<div id="credits">
<h1 id="title">Credits</h1>
{{range $key, $val := .Credits}}
{{ range $key, $val := .Credits }}
<details>
<summary>{{$key}}</summary>
<p>{{$val}}</p>
<summary>{{ $key }}</summary>
<p>{{ $val }}</p>
</details>
{{end}}
{{ end }}
</div>
</div>
</div>
{{template "footer"}}
{{ template "footer" }}
</body>
</html>