mirror of
https://github.com/rramiachraf/dumb.git
synced 2025-04-04 13:27:36 +03:00
feat: extract json from body instead of scraping by elements
This commit is contained in:
parent
c03bd495f6
commit
5898340920
5 changed files with 98 additions and 92 deletions
9
go.mod
9
go.mod
|
@ -3,14 +3,11 @@ module github.com/rramiachraf/dumb
|
|||
go 1.18
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.8.0
|
||||
github.com/allegro/bigcache/v3 v3.0.2
|
||||
github.com/gorilla/mux v1.8.0
|
||||
github.com/russross/blackfriday/v2 v2.1.0
|
||||
github.com/sirupsen/logrus v1.9.0
|
||||
github.com/valyala/fastjson v1.6.4
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/andybalholm/cascadia v1.3.1 // indirect
|
||||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect
|
||||
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 // indirect
|
||||
)
|
||||
require golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 // indirect
|
||||
|
|
15
go.sum
15
go.sum
|
@ -1,9 +1,5 @@
|
|||
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
|
||||
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
|
||||
github.com/allegro/bigcache/v3 v3.0.2 h1:AKZCw+5eAaVyNTBmI2fgyPVJhHkdWder3O9IrprcQfI=
|
||||
github.com/allegro/bigcache/v3 v3.0.2/go.mod h1:aPyh7jEvrog9zAwx5N7+JUQX5dZTSGpxF1LAR4dr35I=
|
||||
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
|
||||
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
|
@ -11,20 +7,17 @@ github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
|
|||
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
|
||||
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
||||
github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0=
|
||||
github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk=
|
||||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ=
|
||||
github.com/valyala/fastjson v1.6.4/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY=
|
||||
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 h1:0A+M6Uqn+Eje4kHMK80dtF3JCXC4ykBgQG4Fe06QRhQ=
|
||||
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
|
136
lyrics.go
136
lyrics.go
|
@ -1,74 +1,92 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/russross/blackfriday/v2"
|
||||
"github.com/valyala/fastjson"
|
||||
)
|
||||
|
||||
type song struct {
|
||||
Artist string
|
||||
Title string
|
||||
Image string
|
||||
Lyrics string
|
||||
Credits map[string]string
|
||||
About [2]string
|
||||
Artist string
|
||||
Title string
|
||||
Image string
|
||||
Lyrics string
|
||||
Credits map[string]string
|
||||
About [2]string
|
||||
PrimaryColor string
|
||||
}
|
||||
|
||||
func (s *song) parseLyrics(doc *goquery.Document) {
|
||||
doc.Find("[data-lyrics-container='true']").Each(func(i int, ss *goquery.Selection) {
|
||||
h, err := ss.Html()
|
||||
if err != nil {
|
||||
logger.Errorln("unable to parse lyrics", err)
|
||||
}
|
||||
s.Lyrics += h
|
||||
})
|
||||
}
|
||||
|
||||
func (s *song) parseMetadata(doc *goquery.Document) {
|
||||
artist := doc.Find("a[class*='Artist']").First().Text()
|
||||
title := doc.Find("h1[class*='Title']").First().Text()
|
||||
image, exists := doc.Find("meta[property='og:image']").Attr("content")
|
||||
if exists {
|
||||
if u, err := url.Parse(image); err == nil {
|
||||
s.Image = fmt.Sprintf("/images%s", u.Path)
|
||||
}
|
||||
func fixJSON(in []byte) []byte {
|
||||
var out = in
|
||||
replaceList := map[string]string{
|
||||
`{\"`: `{"`,
|
||||
`\":`: `":`,
|
||||
`:\"`: `:"`,
|
||||
`\",`: `",`,
|
||||
`,\"`: `,"`,
|
||||
`\"}`: `"}`,
|
||||
`[\"`: `["`,
|
||||
`\"],`: `"],`,
|
||||
`\"]}`: `"]}`,
|
||||
`\\n`: ``,
|
||||
`\'`: `'`,
|
||||
`\\"`: `"`,
|
||||
}
|
||||
|
||||
s.Title = title
|
||||
s.Artist = artist
|
||||
}
|
||||
|
||||
func (s *song) parseCredits(doc *goquery.Document) {
|
||||
credits := make(map[string]string)
|
||||
|
||||
doc.Find("[class*='SongInfo__Credit']").Each(func(i int, ss *goquery.Selection) {
|
||||
key := ss.Children().First().Text()
|
||||
value := ss.Children().Last().Text()
|
||||
credits[key] = value
|
||||
})
|
||||
|
||||
s.Credits = credits
|
||||
}
|
||||
|
||||
func (s *song) parseAbout(doc *goquery.Document) {
|
||||
s.About[0] = doc.Find("[class*='SongDescription__Content']").Text()
|
||||
summary := strings.Split(s.About[0], "")
|
||||
|
||||
if len(summary) > 250 {
|
||||
s.About[1] = strings.Join(summary[0:250], "") + "..."
|
||||
for match, replacer := range replaceList {
|
||||
out = bytes.ReplaceAll(out, []byte(match), []byte(replacer))
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *song) parse(doc *goquery.Document) {
|
||||
s.parseLyrics(doc)
|
||||
s.parseMetadata(doc)
|
||||
s.parseCredits(doc)
|
||||
s.parseAbout(doc)
|
||||
func (s *song) parse(urlPath string, preload []byte) {
|
||||
jsonData := fixJSON(preload)
|
||||
|
||||
var parser fastjson.Parser
|
||||
|
||||
v, err := parser.Parse(string(jsonData))
|
||||
if err != nil {
|
||||
logger.Errorf(`%s: %s\n`, urlPath, err)
|
||||
}
|
||||
|
||||
v.Del("currentPage")
|
||||
v.Del("deviceType")
|
||||
v.Del("session")
|
||||
|
||||
s.Lyrics = string(v.GetStringBytes("songPage", "lyricsData", "body", "html"))
|
||||
s.Credits = make(map[string]string)
|
||||
|
||||
v.GetObject("entities", "songs").Visit(func(key []byte, v *fastjson.Value) {
|
||||
path := strings.ToLower(string(v.GetStringBytes("path")))
|
||||
if path == urlPath {
|
||||
s.Title = string(v.GetStringBytes("title"))
|
||||
s.Artist = string(v.GetStringBytes("artistNames"))
|
||||
s.About[0] = string(blackfriday.Run(v.GetStringBytes("description", "markdown")))
|
||||
s.About[1] = string(v.GetStringBytes("descriptionPreview"))
|
||||
if u, err := url.Parse(string(v.GetStringBytes("songArtImageUrl"))); err == nil {
|
||||
s.Image = fmt.Sprintf("/images%s", u.Path)
|
||||
}
|
||||
s.PrimaryColor = string(v.GetStringBytes("songArtPrimaryColor"))
|
||||
|
||||
for _, v := range v.GetArray("customPerformances") {
|
||||
label := v.GetStringBytes("label")
|
||||
var artists []string
|
||||
for _, v := range v.GetArray("artists") {
|
||||
artists = append(artists, string(v.GetStringBytes("name")))
|
||||
}
|
||||
s.Credits[string(label)] = strings.Join(artists, ", ")
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func lyricsHandler(w http.ResponseWriter, r *http.Request) {
|
||||
|
@ -100,19 +118,17 @@ func lyricsHandler(w http.ResponseWriter, r *http.Request) {
|
|||
return
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
bodyHTML, err := io.ReadAll(resp.Body)
|
||||
|
||||
rgx, err := regexp.Compile(`window\.__PRELOADED_STATE__ = JSON.parse\('(.*)'\);`)
|
||||
if err != nil {
|
||||
logger.Errorln(err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
render("error", w, map[string]string{
|
||||
"Status": "500",
|
||||
"Error": "something went wrong",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
preload := rgx.FindSubmatch(bodyHTML)[1]
|
||||
|
||||
var s song
|
||||
s.parse(doc)
|
||||
s.parse(r.URL.RequestURI(), preload)
|
||||
|
||||
render("lyrics", w, s)
|
||||
setCache(id, s)
|
||||
|
|
|
@ -6,7 +6,7 @@ function showAbout() {
|
|||
fullAbout.classList.toggle("hidden")
|
||||
}
|
||||
|
||||
[fullAbout, summary].forEach(item => item.onclick = showAbout)
|
||||
//[fullAbout, summary].forEach(item => item.onclick = showAbout)
|
||||
|
||||
document.querySelectorAll("#lyrics a").forEach(item => {
|
||||
item.addEventListener("click", getAnnotation)
|
||||
|
|
|
@ -1,38 +1,38 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>{{.Artist}} - {{.Title}} lyrics</title>
|
||||
<title>{{ printf "%s - %s" .Artist .Title }} Lyrics</title>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<link rel="stylesheet" type="text/css" href="/static/style.css" />
|
||||
<script type="text/javascript" src="/static/script.js" defer></script>
|
||||
</head>
|
||||
<body>
|
||||
{{template "navbar"}}
|
||||
{{ template "navbar" }}
|
||||
<div id="container">
|
||||
<div id="metadata">
|
||||
<img src="{{.Image}}"/>
|
||||
<h2>{{.Artist}}</h2>
|
||||
<h1>{{.Title}}</h1>
|
||||
<h2>{{ .Artist }}</h2>
|
||||
<h1>{{ .Title }}</h1>
|
||||
</div>
|
||||
<div id="lyrics">{{.Lyrics}}</div>
|
||||
<div id="lyrics">{{ .Lyrics }}</div>
|
||||
<div id="info">
|
||||
<div id="about">
|
||||
<!--div id="about">
|
||||
<h1 id="title">About</h1>
|
||||
<p class="hidden" id="full_about">{{index .About 0}}</p>
|
||||
<p id="summary">{{index .About 1}}</p>
|
||||
</div>
|
||||
<div class="hidden" id="full_about">{{ index .About 0 }}</div>
|
||||
<p id="summary">{{ index .About 1 }}</p>
|
||||
</div-->
|
||||
<div id="credits">
|
||||
<h1 id="title">Credits</h1>
|
||||
{{range $key, $val := .Credits}}
|
||||
{{ range $key, $val := .Credits }}
|
||||
<details>
|
||||
<summary>{{$key}}</summary>
|
||||
<p>{{$val}}</p>
|
||||
<summary>{{ $key }}</summary>
|
||||
<p>{{ $val }}</p>
|
||||
</details>
|
||||
{{end}}
|
||||
{{ end }}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{{template "footer"}}
|
||||
{{ template "footer" }}
|
||||
</body>
|
||||
</html>
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue