649 lines
14 KiB
Go
649 lines
14 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"errors"
|
|
"html"
|
|
"net/url"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/jbowtie/gokogiri"
|
|
gokogirixml "github.com/jbowtie/gokogiri/xml"
|
|
"github.com/lunny/html2md"
|
|
"github.com/microcosm-cc/bluemonday"
|
|
"mvdan.cc/xurls/v2"
|
|
)
|
|
|
|
//FBPostData FBPostData
|
|
type FBPostData struct {
|
|
PostURL string
|
|
TimeStamp string
|
|
ProfileLink *ProfileLink
|
|
GiftURL string
|
|
ImageURL string
|
|
Content string
|
|
Summary string
|
|
Title string
|
|
Author string
|
|
Tags string
|
|
}
|
|
|
|
// ParsePost ParsePost
|
|
func ParsePost(s, PostURL string) (*FBPostData, error) {
|
|
fb := FBPostData{PostURL: PostURL}
|
|
|
|
docKogiri, err := gokogiri.ParseHtml([]byte(s))
|
|
if err != nil {
|
|
return &fb, err
|
|
}
|
|
defer docKogiri.Free()
|
|
htmlNode := docKogiri.Root().FirstChild()
|
|
|
|
//doc, err := goquery.NewDocumentFromReader(strings.NewReader(s))
|
|
//if err != nil {
|
|
// return &fb, err
|
|
//}
|
|
|
|
fb.TimeStamp, err = GetTimeStamp(htmlNode)
|
|
if err != nil {
|
|
return &fb, err
|
|
}
|
|
|
|
//fmt.Printf("\n\n%#v\n\n", dataFT)
|
|
|
|
//fb.ProfileLink, err = GetProfileLink(doc)
|
|
//if err != nil {
|
|
// return &fb, err
|
|
//}
|
|
|
|
fb.ImageURL, err = GetImageURL(htmlNode)
|
|
if err != nil {
|
|
return &fb, err
|
|
}
|
|
|
|
fb.GiftURL, err = GetGiftURL(htmlNode)
|
|
if err != nil {
|
|
return &fb, err
|
|
}
|
|
|
|
fb.PostURL, err = GetPostURL(htmlNode)
|
|
if err != nil {
|
|
fb.PostURL = PostURL
|
|
return &fb, err
|
|
}
|
|
|
|
fb.Content, err = GetContent(htmlNode)
|
|
if err != nil {
|
|
return &fb, err
|
|
}
|
|
|
|
if strings.Contains(fb.Content, FreeSilverCaskets) {
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = FreeSilverCaskets
|
|
} else {
|
|
fb.Title = FreeSilverCaskets + "+" + fb.Title
|
|
}
|
|
}
|
|
|
|
if strings.Contains(fb.Content, FreeTitanArtifact) {
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = FreeTitanArtifact
|
|
} else {
|
|
fb.Title = FreeTitanArtifact + "+" + fb.Title
|
|
}
|
|
}
|
|
|
|
if strings.Contains(fb.Content, FreeSoulStones) {
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = FreeSoulStones
|
|
} else {
|
|
fb.Title = FreeSoulStones + "+" + fb.Title
|
|
}
|
|
}
|
|
|
|
if strings.Contains(fb.Content, FreeSkinStones) {
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = FreeSkinStones
|
|
} else {
|
|
fb.Title = FreeSkinStones + "+" + fb.Title
|
|
}
|
|
}
|
|
|
|
if strings.Contains(fb.Content, ActionKeepTheAmount) {
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = ActionKeepTheAmount
|
|
} else {
|
|
fb.Title = ActionKeepTheAmount + "+" + fb.Title
|
|
}
|
|
fb.GiftURL = fb.PostURL
|
|
}
|
|
|
|
if strings.Contains(fb.Content, FreeWinterfestBaubles) {
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = FreeWinterfestBaubles
|
|
} else {
|
|
fb.Title = FreeWinterfestBaubles + "+" + fb.Title
|
|
}
|
|
}
|
|
|
|
if strings.Contains(fb.Content, FreeTopFanPackage) {
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = FreeTopFanPackage
|
|
} else {
|
|
fb.Title = FreeTopFanPackage + "+" + fb.Title
|
|
}
|
|
}
|
|
|
|
if strings.Contains(fb.Content, FreeEnergyForFee) {
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = FreeEnergyForFee
|
|
} else {
|
|
fb.Title = FreeEnergyForFee + "+" + fb.Title
|
|
}
|
|
}
|
|
|
|
if strings.Contains(fb.Content, WinterfestRankingRewards) {
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = WinterfestRankingRewards
|
|
} else {
|
|
fb.Title = WinterfestRankingRewards + "+" + fb.Title
|
|
}
|
|
}
|
|
|
|
if len(fb.Title) == 0 {
|
|
fb.Title = "unknown - need to implemented"
|
|
}
|
|
defer func() {
|
|
dataFT = nil
|
|
}()
|
|
|
|
return &fb, nil
|
|
}
|
|
|
|
// Parse Parse
|
|
func Parse(url string) (*FBPostData, error) {
|
|
doc, err := goquery.NewDocument(url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if strings.Contains(url, ".blogspot.") {
|
|
return ParseBlogspotPost(doc)
|
|
}
|
|
|
|
// If not login, post looks like
|
|
// <div class="hidden_elem"><code id="u_0_p"><!-- ... --></code></div>
|
|
s := QuerySelector(doc, "div.hidden_elem > code")
|
|
cmt, err := s.Html()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(cmt) != 0 {
|
|
return ParsePost(cmt, url)
|
|
}
|
|
|
|
s = QuerySelector(doc, "div._427x")
|
|
cmt, err = s.Html()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return ParsePost(cmt, url)
|
|
}
|
|
|
|
// ParseAll ParseAll
|
|
func ParseAll(url string) ([]*FBPostData, error) {
|
|
doc, err := goquery.NewDocument(url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
allFbPosts := []*FBPostData{}
|
|
|
|
QuerySelectorEach(doc, "div._427x", func(i int, selected *goquery.Selection) {
|
|
cmt, err := selected.Html()
|
|
if err != nil {
|
|
return
|
|
}
|
|
fbPost, err := ParsePost(cmt, url)
|
|
if err != nil {
|
|
return
|
|
}
|
|
allFbPosts = append(allFbPosts, fbPost)
|
|
})
|
|
|
|
return allFbPosts, nil
|
|
}
|
|
|
|
// GetContent GetContent
|
|
func GetContent(htmlNode gokogirixml.Node) (string, error) {
|
|
results, err := htmlNode.Search(xPathArticleContent)
|
|
if err != nil {
|
|
//fmt.Printf("ERR: %#v -- %#v\n", results, err)
|
|
return "", err
|
|
}
|
|
//fmt.Printf("ERR: %#v -- %#v\n", results, err)
|
|
if len(results) > 0 {
|
|
if resultHTML := results[0].InnerHtml(); resultHTML != "" {
|
|
|
|
bmsanizer := bluemonday.StrictPolicy()
|
|
bmsanizer.AllowAttrs("href").OnElements("a")
|
|
bmsanizer.AllowElements("p")
|
|
bmsanizer.RequireParseableURLs(true)
|
|
htmlText := bmsanizer.SanitizeBytes([]byte(resultHTML))
|
|
|
|
content := strings.Join(strings.Fields(string(htmlText)), " ")
|
|
content = html2md.Convert(content)
|
|
content = html.UnescapeString(content)
|
|
content = strings.TrimSpace(content)
|
|
|
|
return content, nil
|
|
}
|
|
}
|
|
return "", errors.New("no content found")
|
|
}
|
|
|
|
// GetImageURL GetImageURL
|
|
func GetImageURL(htmlNode gokogirixml.Node) (string, error) {
|
|
|
|
results, err := htmlNode.Search(xPathImageURL)
|
|
if err != nil {
|
|
//fmt.Printf("ERR: %#v -- %#v\n", results, err)
|
|
return "", err
|
|
}
|
|
//fmt.Printf("RESULT: %#v -- %#v\n", results, err)
|
|
// search for the second possible image
|
|
if len(results) == 0 {
|
|
results, err = htmlNode.Search(xPathImageURL2)
|
|
if err != nil {
|
|
//fmt.Printf("ERR2: %#v -- %#v\n", results, err)
|
|
return "", err
|
|
}
|
|
}
|
|
// Video Preview
|
|
if len(results) == 0 {
|
|
results, err = htmlNode.Search(xPathImageURL3)
|
|
if err != nil {
|
|
//fmt.Printf("ERR2: %#v -- %#v\n", results, err)
|
|
return "", err
|
|
}
|
|
}
|
|
//fmt.Printf("RESULT2: %#v -- %#v\n", results, err)
|
|
if len(results) > 0 {
|
|
if attrib := results[0].Attribute("style"); attrib != nil {
|
|
styleContent := attrib.Value()
|
|
codedImageURL := regexStyleImage.FindString(styleContent)
|
|
genURL := strings.ReplaceAll(codedImageURL, `\3a `, `:`)
|
|
genURL = strings.ReplaceAll(genURL, `\3d `, `=`)
|
|
genURL = strings.ReplaceAll(genURL, `\26 `, `&`)
|
|
genURL = strings.ReplaceAll(genURL, `\25 `, `%`)
|
|
genURL = strings.TrimPrefix(genURL, `url('`)
|
|
genURL = strings.TrimSuffix(genURL, `');`)
|
|
return genURL, nil
|
|
|
|
}
|
|
}
|
|
return "", errors.New("cannot find image url")
|
|
//s := QuerySelector(doc, "img.scaledImageFitHeight")
|
|
//if s.Length() == 0 {
|
|
// s = QuerySelector(doc, "img.scaledImageFitWidth")
|
|
//}
|
|
//
|
|
//url, ok := s.Attr("src")
|
|
//if !ok {
|
|
// return "", errors.New("cannot find image url")
|
|
//}
|
|
//
|
|
//return url, nil
|
|
}
|
|
|
|
// GetGiftURL GetGiftURL
|
|
func GetGiftURL(htmlNode gokogirixml.Node) (string, error) {
|
|
|
|
//s := QuerySelector(doc, "div._6ks > a")
|
|
//
|
|
//url, ok := s.Attr("href")
|
|
|
|
results, err := htmlNode.Search(xPathGiftURL)
|
|
if err != nil {
|
|
// search for the second possible image
|
|
results, err = htmlNode.Search(xPathImageURL2)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
}
|
|
|
|
var genURL string
|
|
var attrib *gokogirixml.AttributeNode
|
|
if len(results) > 0 {
|
|
attrib = results[0].Attribute("href")
|
|
if attrib != nil {
|
|
tmpGiftURL := attrib.Value()
|
|
tmpGiftURL = html.UnescapeString(tmpGiftURL)
|
|
tmpGiftURL2, err := url.Parse(tmpGiftURL)
|
|
if err != nil {
|
|
attrib = nil
|
|
goto nextGiftLinkChecker
|
|
}
|
|
|
|
//fmt.Printf("%#v\n", tmpGiftURL2)
|
|
|
|
newQuery := url.Values{}
|
|
oldQuery := tmpGiftURL2.Query()
|
|
newQuery.Set("nx_source", oldQuery.Get("nx_source"))
|
|
newQuery.Set("gift_id", oldQuery.Get("gift_id"))
|
|
|
|
tmpURLGen := url.URL{
|
|
Scheme: tmpGiftURL2.Scheme,
|
|
Host: tmpGiftURL2.Host,
|
|
Path: tmpGiftURL2.Path,
|
|
RawQuery: newQuery.Encode(),
|
|
}
|
|
if tmpURLGen.Scheme != "https" {
|
|
tmpURLGen.Scheme = "https"
|
|
}
|
|
genURL = tmpURLGen.String()
|
|
}
|
|
}
|
|
nextGiftLinkChecker:
|
|
if attrib == nil {
|
|
results, err := htmlNode.Search(xPathArticleContent)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if len(results) > 0 {
|
|
//TODO: Fix and check all URLs
|
|
rxRelaxed := xurls.Relaxed()
|
|
genURL = rxRelaxed.FindString(results[0].Content())
|
|
}
|
|
}
|
|
|
|
if len(genURL) == 0 {
|
|
return "", errors.New("cannot find gift url")
|
|
}
|
|
//fmt.Printf("%#v\n", genURL)
|
|
resultURL, err := ExpandURL2(genURL, fbGameURL)
|
|
if err != nil {
|
|
return "", errors.New("cannot find gift url - ExpandURL2")
|
|
}
|
|
|
|
return resultURL, nil
|
|
}
|
|
|
|
// GetPostURL GetPostURL
|
|
func GetPostURL(htmlNode gokogirixml.Node) (string, error) {
|
|
|
|
//s := QuerySelector(doc, "a._5pcq")
|
|
//
|
|
//url, ok := s.Attr("href")
|
|
//if !ok {
|
|
// //TODO: Fix and check all URLs
|
|
// rxRelaxed := xurls.Relaxed()
|
|
// url = rxRelaxed.FindString(doc.Text())
|
|
//
|
|
// if len(url) == 0 {
|
|
// return "", errors.New("cannot find post url")
|
|
// }
|
|
//} else {
|
|
// url = "https://www.facebook.com" + url
|
|
//}
|
|
results, err := htmlNode.Search(xPathPostingURL)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
var genURL string
|
|
var attrib *gokogirixml.AttributeNode
|
|
if len(results) > 0 {
|
|
attrib = results[0].Attribute("href")
|
|
//fmt.Printf("%#v\n\n", attrib)
|
|
if attrib != nil {
|
|
tmpPostURL := attrib.Value()
|
|
//fmt.Printf("%#v\n", tmpPostURL)
|
|
tmpPostURL = html.UnescapeString(tmpPostURL)
|
|
//fmt.Printf("%#v\n", tmpPostURL)
|
|
tmpPostURL2, err := url.Parse(tmpPostURL)
|
|
//fmt.Printf("%#v -- %#v\n", tmpPostURL2, err)
|
|
if err != nil {
|
|
attrib = nil
|
|
goto nextPostLinkChecker
|
|
}
|
|
|
|
//fmt.Printf("%#v\n", tmpPostURL2)
|
|
|
|
newQuery := url.Values{}
|
|
oldQuery := tmpPostURL2.Query()
|
|
newQuery.Set("nx_source", oldQuery.Get("nx_source"))
|
|
newQuery.Set("gift_id", oldQuery.Get("gift_id"))
|
|
|
|
tmpURLGen := url.URL{
|
|
Scheme: tmpPostURL2.Scheme,
|
|
Host: tmpPostURL2.Host,
|
|
Path: `herowarsgame/posts/` + dataFT.TopLevelPostID,
|
|
}
|
|
if tmpURLGen.Scheme != "https" {
|
|
tmpURLGen.Scheme = "https"
|
|
}
|
|
if tmpURLGen.Host != "www.facebook.com" {
|
|
tmpURLGen.Host = "www.facebook.com"
|
|
}
|
|
|
|
// dataFT.
|
|
|
|
genURL = tmpURLGen.String()
|
|
}
|
|
//fmt.Printf("GENURL: %#v\n", genURL)
|
|
}
|
|
nextPostLinkChecker:
|
|
|
|
resultURL, err := ExpandURL2(genURL, fbPageURL)
|
|
if err != nil {
|
|
return "", errors.New("cannot find post url - ExpandURL2")
|
|
}
|
|
|
|
return resultURL, nil
|
|
}
|
|
|
|
type object interface {
|
|
Find(string) *goquery.Selection
|
|
}
|
|
|
|
// QuerySelector QuerySelector
|
|
func QuerySelector(s object, selector string) *goquery.Selection {
|
|
return s.Find(selector).First()
|
|
}
|
|
|
|
// QuerySelectorEach QuerySelectorEach
|
|
func QuerySelectorEach(s object, selector string, mf func(i int, selection *goquery.Selection)) *goquery.Selection {
|
|
return s.Find(selector).Each(mf)
|
|
}
|
|
|
|
// ParseTimeStamp formats utime, a Unix time in seconds, as an RFC 3339
// string in the process's local time zone. The returned error is always nil.
func ParseTimeStamp(utime int64) (string, error) {
	return time.Unix(utime, 0).Format(time.RFC3339), nil
}
|
|
|
|
// GetTimeStamp GetTimeStamp
|
|
func GetTimeStamp(htmlNode gokogirixml.Node) (string, error) {
|
|
//s := QuerySelector(doc, "._5ptz.timestamp.livetimestamp")
|
|
//s := QuerySelector(doc, "abbr._5ptz")
|
|
|
|
results, err := htmlNode.Search(xPathArticle)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if attrib := results[0].Attribute("data-ft"); attrib != nil {
|
|
json := attrib.Value()
|
|
normalJSON := html.UnescapeString(json)
|
|
dataFTinternal, err := UnmarshalDataFT([]byte(normalJSON))
|
|
|
|
dataFT = &dataFTinternal
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
}
|
|
pid := dataFTinternal.PageID
|
|
return ParseTimeStamp(dataFTinternal.PageInsights[pid].PostContext.PublishTime)
|
|
|
|
}
|
|
|
|
return "", errors.New("cannot find timestamp")
|
|
}
|
|
|
|
// ProfileLink is a profile anchor as extracted by GetProfileLink: Name is the
// anchor text and URL its href attribute.
type ProfileLink struct {
	Name, URL string
}
|
|
|
|
// GetProfileLink GetProfileLink
|
|
func GetProfileLink(doc *goquery.Document) (*ProfileLink, error) {
|
|
s := QuerySelector(doc, "a.profileLink")
|
|
if s.Length() == 0 {
|
|
s = QuerySelector(doc, "span.fwb.fcg > a")
|
|
}
|
|
|
|
pl := ProfileLink{}
|
|
|
|
pl.Name = s.Text()
|
|
if pl.Name == "" {
|
|
return nil, errors.New("cannot find name of profile link")
|
|
}
|
|
|
|
url, ok := s.Attr("href")
|
|
if !ok {
|
|
return nil, errors.New("cannot find url of profile link")
|
|
}
|
|
pl.URL = url
|
|
|
|
return &pl, nil
|
|
}
|
|
|
|
// GetBlogspotTimeStamp GetBlogspotTimeStamp
|
|
func GetBlogspotTimeStamp(doc *goquery.Document) (string, error) {
|
|
abbr := QuerySelector(doc, "a.timestamp-link > abbr")
|
|
t, ok := abbr.Attr("title")
|
|
if ok {
|
|
return t, nil
|
|
}
|
|
|
|
return "", errors.New("cannot find timestamp")
|
|
}
|
|
|
|
// GetBlogspotTitle GetBlogspotTitle
|
|
func GetBlogspotTitle(doc *goquery.Document) (string, error) {
|
|
t := QuerySelector(doc, "h3.post-title")
|
|
return strings.TrimSpace(t.Text()), nil
|
|
}
|
|
|
|
//GetBlogspotContent GetBlogspotContent
|
|
func GetBlogspotContent(doc *goquery.Document) (string, error) {
|
|
c := QuerySelector(doc, "div.post-body")
|
|
|
|
s, err := c.Html()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
var lines []string
|
|
|
|
scanner := bufio.NewScanner(strings.NewReader(s))
|
|
for scanner.Scan() {
|
|
lines = append(lines, " "+scanner.Text())
|
|
}
|
|
|
|
if err := scanner.Err(); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return strings.Join(lines, "\n"), nil
|
|
}
|
|
|
|
//GetBlogspotURL GetBlogspotURL
|
|
func GetBlogspotURL(doc *goquery.Document) (string, error) {
|
|
meta := QuerySelector(doc, "meta[property='og:url']")
|
|
u, ok := meta.Attr("content")
|
|
if ok {
|
|
return u, nil
|
|
}
|
|
|
|
return "", errors.New("cannot find url")
|
|
}
|
|
|
|
// GetBlogspotSummary GetBlogspotSummary
|
|
func GetBlogspotSummary(doc *goquery.Document) (string, error) {
|
|
meta := QuerySelector(doc, "meta[property='og:description']")
|
|
d, ok := meta.Attr("content")
|
|
if ok {
|
|
return strings.TrimSpace(d), nil
|
|
}
|
|
|
|
return "", errors.New("cannot find summary")
|
|
}
|
|
|
|
// GetBlogspotAuthor GetBlogspotAuthor
|
|
func GetBlogspotAuthor(doc *goquery.Document) (string, error) {
|
|
a := QuerySelector(doc, "span.post-author > span.fn")
|
|
return a.Text(), nil
|
|
}
|
|
|
|
//GetBlogspotTags GetBlogspotTags
|
|
func GetBlogspotTags(doc *goquery.Document) (string, error) {
|
|
s := doc.Find("span.post-labels > a")
|
|
labels := ""
|
|
s.Each(func(_ int, l *goquery.Selection) {
|
|
if labels != "" {
|
|
labels += ", "
|
|
}
|
|
labels += l.Text()
|
|
})
|
|
return labels, nil
|
|
}
|
|
|
|
// ParseBlogspotPost ParseBlogspotPost
|
|
func ParseBlogspotPost(doc *goquery.Document) (*FBPostData, error) {
|
|
bs := FBPostData{}
|
|
var err error
|
|
|
|
bs.TimeStamp, err = GetBlogspotTimeStamp(doc)
|
|
if err != nil {
|
|
return &bs, err
|
|
}
|
|
|
|
bs.Title, err = GetBlogspotTitle(doc)
|
|
if err != nil {
|
|
return &bs, err
|
|
}
|
|
|
|
bs.Content, err = GetBlogspotContent(doc)
|
|
if err != nil {
|
|
return &bs, err
|
|
}
|
|
|
|
bs.PostURL, err = GetBlogspotURL(doc)
|
|
if err != nil {
|
|
return &bs, err
|
|
}
|
|
|
|
bs.Summary, err = GetBlogspotSummary(doc)
|
|
if err != nil {
|
|
return &bs, err
|
|
}
|
|
|
|
bs.Author, err = GetBlogspotAuthor(doc)
|
|
if err != nil {
|
|
return &bs, err
|
|
}
|
|
|
|
bs.Tags, err = GetBlogspotTags(doc)
|
|
if err != nil {
|
|
return &bs, err
|
|
}
|
|
|
|
return &bs, nil
|
|
}
|