package main import ( "bufio" "errors" "html" "net/url" "strings" "time" "github.com/PuerkitoBio/goquery" "github.com/jbowtie/gokogiri" gokogirixml "github.com/jbowtie/gokogiri/xml" "github.com/lunny/html2md" "github.com/microcosm-cc/bluemonday" "mvdan.cc/xurls/v2" ) //FBPostData FBPostData type FBPostData struct { PostURL string TimeStamp string ProfileLink *ProfileLink GiftURL string ImageURL string Content string Summary string Title string Author string Tags string } // ParsePost ParsePost func ParsePost(s, PostURL string) (*FBPostData, error) { fb := FBPostData{PostURL: PostURL} docKogiri, err := gokogiri.ParseHtml([]byte(s)) if err != nil { return &fb, err } defer docKogiri.Free() htmlNode := docKogiri.Root().FirstChild() //doc, err := goquery.NewDocumentFromReader(strings.NewReader(s)) //if err != nil { // return &fb, err //} fb.TimeStamp, err = GetTimeStamp(htmlNode) if err != nil { return &fb, err } //fmt.Printf("\n\n%#v\n\n", dataFT) //fb.ProfileLink, err = GetProfileLink(doc) //if err != nil { // return &fb, err //} fb.ImageURL, err = GetImageURL(htmlNode) if err != nil { return &fb, err } fb.GiftURL, err = GetGiftURL(htmlNode) if err != nil { return &fb, err } fb.PostURL, err = GetPostURL(htmlNode) if err != nil { fb.PostURL = PostURL return &fb, err } fb.Content, err = GetContent(htmlNode) if err != nil { return &fb, err } if strings.Contains(fb.Content, SearchFreeSilverCaskets) { if len(fb.Title) == 0 { fb.Title = TitleFreeSilverCaskets } else { fb.Title = TitleFreeSilverCaskets + "+" + fb.Title } } if strings.Contains(fb.Content, SearchFreeTitanArtifact) { if len(fb.Title) == 0 { fb.Title = TitleFreeTitanArtifact } else { fb.Title = TitleFreeTitanArtifact + "+" + fb.Title } } if strings.Contains(fb.Content, SearchFreeSoulStones) { if len(fb.Title) == 0 { fb.Title = TitleFreeSoulStones } else { fb.Title = TitleFreeSoulStones + "+" + fb.Title } } if strings.Contains(fb.Content, SearchFreeSkinStones) { if len(fb.Title) == 0 { fb.Title = TitleFreeSkinStones } else { fb.Title = TitleFreeSkinStones + "+" + fb.Title } } if strings.Contains(fb.Content, SearchActionKeepTheAmount) { if len(fb.Title) == 0 { fb.Title = TitleActionKeepTheAmount } else { fb.Title = TitleActionKeepTheAmount + "+" + fb.Title } fb.GiftURL = fb.PostURL } if strings.Contains(fb.Content, SearchFreeWinterfestBaubles) { if len(fb.Title) == 0 { fb.Title = TitleFreeWinterfestBaubles } else { fb.Title = TitleFreeWinterfestBaubles + "+" + fb.Title } } if strings.Contains(fb.Content, SearchFreeTopFanPackage) { if len(fb.Title) == 0 { fb.Title = TitleFreeTopFanPackage } else { fb.Title = TitleFreeTopFanPackage + "+" + fb.Title } } if strings.Contains(fb.Content, SearchFreeEnergyForFee) { if len(fb.Title) == 0 { fb.Title = TitleFreeEnergyForFee } else { fb.Title = TitleFreeEnergyForFee + "+" + fb.Title } } if strings.Contains(fb.Content, SearchWinterfestRankingRewards) { if len(fb.Title) == 0 { fb.Title = TitleWinterfestRankingRewards } else { fb.Title = TitleWinterfestRankingRewards + "+" + fb.Title } } if len(fb.Title) == 0 { fb.Title = "unknown - need to implemented" } defer func() { dataFT = nil }() return &fb, nil } // Parse Parse func Parse(url string) (*FBPostData, error) { doc, err := goquery.NewDocument(url) if err != nil { return nil, err } if strings.Contains(url, ".blogspot.") { return ParseBlogspotPost(doc) } // If not login, post looks like //
s := QuerySelector(doc, "div.hidden_elem > code") cmt, err := s.Html() if err != nil { return nil, err } if len(cmt) != 0 { return ParsePost(cmt, url) } s = QuerySelector(doc, "div._427x") cmt, err = s.Html() if err != nil { return nil, err } return ParsePost(cmt, url) } // ParseAll ParseAll func ParseAll(url string) ([]*FBPostData, error) { doc, err := goquery.NewDocument(url) if err != nil { return nil, err } allFbPosts := []*FBPostData{} QuerySelectorEach(doc, "div._427x", func(i int, selected *goquery.Selection) { cmt, err := selected.Html() if err != nil { return } fbPost, err := ParsePost(cmt, url) if err != nil { return } allFbPosts = append(allFbPosts, fbPost) }) return allFbPosts, nil } // GetContent GetContent func GetContent(htmlNode gokogirixml.Node) (string, error) { results, err := htmlNode.Search(xPathArticleContent) if err != nil { //fmt.Printf("ERR: %#v -- %#v\n", results, err) return "", err } //fmt.Printf("ERR: %#v -- %#v\n", results, err) if len(results) > 0 { if resultHTML := results[0].InnerHtml(); resultHTML != "" { bmsanizer := bluemonday.StrictPolicy() bmsanizer.AllowAttrs("href").OnElements("a") bmsanizer.AllowElements("p") bmsanizer.RequireParseableURLs(true) htmlText := bmsanizer.SanitizeBytes([]byte(resultHTML)) content := strings.Join(strings.Fields(string(htmlText)), " ") content = html2md.Convert(content) content = html.UnescapeString(content) content = strings.TrimSpace(content) return content, nil } } return "", errors.New("no content found") } // GetImageURL GetImageURL func GetImageURL(htmlNode gokogirixml.Node) (string, error) { results, err := htmlNode.Search(xPathImageURL) if err != nil { //fmt.Printf("ERR: %#v -- %#v\n", results, err) return "", err } //fmt.Printf("RESULT: %#v -- %#v\n", results, err) // search for the second possible image if len(results) == 0 { results, err = htmlNode.Search(xPathImageURL2) if err != nil { //fmt.Printf("ERR2: %#v -- %#v\n", results, err) return "", err } } // Video Preview if len(results) == 0 { results, err = htmlNode.Search(xPathImageURL3) if err != nil { //fmt.Printf("ERR2: %#v -- %#v\n", results, err) return "", err } } //fmt.Printf("RESULT2: %#v -- %#v\n", results, err) if len(results) > 0 { if attrib := results[0].Attribute("style"); attrib != nil { styleContent := attrib.Value() codedImageURL := regexStyleImage.FindString(styleContent) genURL := strings.ReplaceAll(codedImageURL, `\3a `, `:`) genURL = strings.ReplaceAll(genURL, `\3d `, `=`) genURL = strings.ReplaceAll(genURL, `\26 `, `&`) genURL = strings.ReplaceAll(genURL, `\25 `, `%`) genURL = strings.TrimPrefix(genURL, `url('`) genURL = strings.TrimSuffix(genURL, `');`) return genURL, nil } } return "", errors.New("cannot find image url") //s := QuerySelector(doc, "img.scaledImageFitHeight") //if s.Length() == 0 { // s = QuerySelector(doc, "img.scaledImageFitWidth") //} // //url, ok := s.Attr("src") //if !ok { // return "", errors.New("cannot find image url") //} // //return url, nil } // GetGiftURL GetGiftURL func GetGiftURL(htmlNode gokogirixml.Node) (string, error) { //s := QuerySelector(doc, "div._6ks > a") // //url, ok := s.Attr("href") results, err := htmlNode.Search(xPathGiftURL) if err != nil { // search for the second possible image results, err = htmlNode.Search(xPathImageURL2) if err != nil { return "", err } } var genURL string var attrib *gokogirixml.AttributeNode if len(results) > 0 { attrib = results[0].Attribute("href") if attrib != nil { tmpGiftURL := attrib.Value() tmpGiftURL = html.UnescapeString(tmpGiftURL) tmpGiftURL2, err := url.Parse(tmpGiftURL) if err != nil { attrib = nil goto nextGiftLinkChecker } //fmt.Printf("%#v\n", tmpGiftURL2) newQuery := url.Values{} oldQuery := tmpGiftURL2.Query() newQuery.Set("nx_source", oldQuery.Get("nx_source")) newQuery.Set("gift_id", oldQuery.Get("gift_id")) tmpURLGen := url.URL{ Scheme: tmpGiftURL2.Scheme, Host: tmpGiftURL2.Host, Path: tmpGiftURL2.Path, RawQuery: newQuery.Encode(), } if tmpURLGen.Scheme != "https" { tmpURLGen.Scheme = "https" } genURL = tmpURLGen.String() } } nextGiftLinkChecker: if attrib == nil { results, err := htmlNode.Search(xPathArticleContent) if err != nil { return "", err } if len(results) > 0 { //TODO: Fix and check all URLs rxRelaxed := xurls.Relaxed() genURL = rxRelaxed.FindString(results[0].Content()) } } if len(genURL) == 0 { return "", errors.New("cannot find gift url") } //fmt.Printf("%#v\n", genURL) resultURL, err := ExpandURL2(genURL, fbGameURL) if err != nil { return "", errors.New("cannot find gift url - ExpandURL2") } return resultURL, nil } // GetPostURL GetPostURL func GetPostURL(htmlNode gokogirixml.Node) (string, error) { //s := QuerySelector(doc, "a._5pcq") // //url, ok := s.Attr("href") //if !ok { // //TODO: Fix and check all URLs // rxRelaxed := xurls.Relaxed() // url = rxRelaxed.FindString(doc.Text()) // // if len(url) == 0 { // return "", errors.New("cannot find post url") // } //} else { // url = "https://www.facebook.com" + url //} results, err := htmlNode.Search(xPathPostingURL) if err != nil { return "", err } var genURL string var attrib *gokogirixml.AttributeNode if len(results) > 0 { attrib = results[0].Attribute("href") //fmt.Printf("%#v\n\n", attrib) if attrib != nil { tmpPostURL := attrib.Value() //fmt.Printf("%#v\n", tmpPostURL) tmpPostURL = html.UnescapeString(tmpPostURL) //fmt.Printf("%#v\n", tmpPostURL) tmpPostURL2, err := url.Parse(tmpPostURL) //fmt.Printf("%#v -- %#v\n", tmpPostURL2, err) if err != nil { attrib = nil goto nextPostLinkChecker } //fmt.Printf("%#v\n", tmpPostURL2) newQuery := url.Values{} oldQuery := tmpPostURL2.Query() newQuery.Set("nx_source", oldQuery.Get("nx_source")) newQuery.Set("gift_id", oldQuery.Get("gift_id")) tmpURLGen := url.URL{ Scheme: tmpPostURL2.Scheme, Host: tmpPostURL2.Host, Path: `herowarsgame/posts/` + dataFT.TopLevelPostID, } if tmpURLGen.Scheme != "https" { tmpURLGen.Scheme = "https" } if tmpURLGen.Host != "www.facebook.com" { tmpURLGen.Host = "www.facebook.com" } // dataFT. genURL = tmpURLGen.String() } //fmt.Printf("GENURL: %#v\n", genURL) } nextPostLinkChecker: resultURL, err := ExpandURL2(genURL, fbPageURL) if err != nil { return "", errors.New("cannot find post url - ExpandURL2") } return resultURL, nil } type object interface { Find(string) *goquery.Selection } // QuerySelector QuerySelector func QuerySelector(s object, selector string) *goquery.Selection { return s.Find(selector).First() } // QuerySelectorEach QuerySelectorEach func QuerySelectorEach(s object, selector string, mf func(i int, selection *goquery.Selection)) *goquery.Selection { return s.Find(selector).Each(mf) } // ParseTimeStamp ParseTimeStamp func ParseTimeStamp(utime int64) (string, error) { t := time.Unix(utime, 0) return t.Format(time.RFC3339), nil } // GetTimeStamp GetTimeStamp func GetTimeStamp(htmlNode gokogirixml.Node) (string, error) { //s := QuerySelector(doc, "._5ptz.timestamp.livetimestamp") //s := QuerySelector(doc, "abbr._5ptz") results, err := htmlNode.Search(xPathArticle) if err != nil { return "", err } if attrib := results[0].Attribute("data-ft"); attrib != nil { json := attrib.Value() normalJSON := html.UnescapeString(json) dataFTinternal, err := UnmarshalDataFT([]byte(normalJSON)) dataFT = &dataFTinternal if err != nil { return "", err } pid := dataFTinternal.PageID return ParseTimeStamp(dataFTinternal.PageInsights[pid].PostContext.PublishTime) } return "", errors.New("cannot find timestamp") } // ProfileLink ProfileLink type ProfileLink struct { Name string URL string } // GetProfileLink GetProfileLink func GetProfileLink(doc *goquery.Document) (*ProfileLink, error) { s := QuerySelector(doc, "a.profileLink") if s.Length() == 0 { s = QuerySelector(doc, "span.fwb.fcg > a") } pl := ProfileLink{} pl.Name = s.Text() if pl.Name == "" { return nil, errors.New("cannot find name of profile link") } url, ok := s.Attr("href") if !ok { return nil, errors.New("cannot find url of profile link") } pl.URL = url return &pl, nil } // GetBlogspotTimeStamp GetBlogspotTimeStamp func GetBlogspotTimeStamp(doc *goquery.Document) (string, error) { abbr := QuerySelector(doc, "a.timestamp-link > abbr") t, ok := abbr.Attr("title") if ok { return t, nil } return "", errors.New("cannot find timestamp") } // GetBlogspotTitle GetBlogspotTitle func GetBlogspotTitle(doc *goquery.Document) (string, error) { t := QuerySelector(doc, "h3.post-title") return strings.TrimSpace(t.Text()), nil } //GetBlogspotContent GetBlogspotContent func GetBlogspotContent(doc *goquery.Document) (string, error) { c := QuerySelector(doc, "div.post-body") s, err := c.Html() if err != nil { return "", err } var lines []string scanner := bufio.NewScanner(strings.NewReader(s)) for scanner.Scan() { lines = append(lines, " "+scanner.Text()) } if err := scanner.Err(); err != nil { return "", err } return strings.Join(lines, "\n"), nil } //GetBlogspotURL GetBlogspotURL func GetBlogspotURL(doc *goquery.Document) (string, error) { meta := QuerySelector(doc, "meta[property='og:url']") u, ok := meta.Attr("content") if ok { return u, nil } return "", errors.New("cannot find url") } // GetBlogspotSummary GetBlogspotSummary func GetBlogspotSummary(doc *goquery.Document) (string, error) { meta := QuerySelector(doc, "meta[property='og:description']") d, ok := meta.Attr("content") if ok { return strings.TrimSpace(d), nil } return "", errors.New("cannot find summary") } // GetBlogspotAuthor GetBlogspotAuthor func GetBlogspotAuthor(doc *goquery.Document) (string, error) { a := QuerySelector(doc, "span.post-author > span.fn") return a.Text(), nil } //GetBlogspotTags GetBlogspotTags func GetBlogspotTags(doc *goquery.Document) (string, error) { s := doc.Find("span.post-labels > a") labels := "" s.Each(func(_ int, l *goquery.Selection) { if labels != "" { labels += ", " } labels += l.Text() }) return labels, nil } // ParseBlogspotPost ParseBlogspotPost func ParseBlogspotPost(doc *goquery.Document) (*FBPostData, error) { bs := FBPostData{} var err error bs.TimeStamp, err = GetBlogspotTimeStamp(doc) if err != nil { return &bs, err } bs.Title, err = GetBlogspotTitle(doc) if err != nil { return &bs, err } bs.Content, err = GetBlogspotContent(doc) if err != nil { return &bs, err } bs.PostURL, err = GetBlogspotURL(doc) if err != nil { return &bs, err } bs.Summary, err = GetBlogspotSummary(doc) if err != nil { return &bs, err } bs.Author, err = GetBlogspotAuthor(doc) if err != nil { return &bs, err } bs.Tags, err = GetBlogspotTags(doc) if err != nil { return &bs, err } return &bs, nil }