rss/bbclearn/bbc.go

166 lines
3.4 KiB
Go
Raw Normal View History

2023-04-13 18:03:56 +00:00
package bbclearn
import (
"github.com/PuerkitoBio/goquery"
"github.com/fthvgb1/wp-go/rss2"
2023-07-09 15:02:46 +00:00
"github.com/fthvgb1/wp-go/safety"
2023-07-09 14:57:47 +00:00
"io"
"log"
2023-04-13 18:03:56 +00:00
"net/http"
2023-07-09 14:57:47 +00:00
"os"
"path/filepath"
"rss/mail"
2023-04-13 18:03:56 +00:00
"strings"
"time"
)
func LearnParse(s string, recentDay int) string {
document, err := goquery.NewDocumentFromReader(strings.NewReader(s))
if err != nil {
return ""
}
var item []rss2.Item
item = append(item, full(document, recentDay))
item = append(item, items(document, recentDay)...)
2023-04-14 14:46:06 +00:00
if len(item) < 1 {
return ""
}
2023-04-13 18:03:56 +00:00
rss := rss2.Rss2{
Title: "BBC 英语教学",
2023-04-13 18:48:44 +00:00
Link: "https://www.bbc.co.uk/learningenglish/chinese/",
2023-04-13 18:03:56 +00:00
LastBuildDate: time.Now().Format(time.RFC1123Z),
Items: item,
}
return rss.GetXML()
}
2023-04-14 15:26:53 +00:00
// parseTime extracts the date encoded at the end of a lesson URL.
//
// The text after the last "-" in u is parsed with the layout "060102"
// (two-digit year, month, day). A non-nil error is returned when u contains no
// "-" at all or when the suffix does not match the layout; in both cases the
// returned time is the zero value.
func parseTime(u string) (date time.Time, err error) {
	const layout = "060102"

	dash := strings.LastIndex(u, "-")
	if dash < 0 {
		// Without a separator there is no date suffix. Report that as a parse
		// failure instead of silently returning the zero time with a nil error.
		return time.Time{}, &time.ParseError{
			Layout:  layout,
			Value:   u,
			Message: `: missing "-YYMMDD" date suffix`,
		}
	}
	return time.Parse(layout, u[dash+1:])
}
func dateFilter(u string, recentDay int) (r bool) {
date, err := parseTime(u)
2023-04-13 18:03:56 +00:00
if err != nil {
return
}
t := time.Now()
2023-05-02 16:31:40 +00:00
if t.Sub(date).Hours()/24-float64(recentDay) > 0 {
2023-04-13 18:03:56 +00:00
return
}
r = true
return
}
func fetch(u string) (r rss2.Item) {
res, err := http.Get(u)
if err != nil {
return
}
dom, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return
}
s := dom.Find("#bbcle-content .widget-container-left")
content, err := goquery.OuterHtml(s.Find(".widget-list,.widget-pagelink").Remove().End())
if err != nil {
return
}
r.Title = s.Find(`div[data-widget-index="3"] h3`).Text()
2023-04-14 15:26:53 +00:00
date, _ := parseTime(u)
r.PubDate = date.Format(time.RFC1123Z)
2023-04-13 18:03:56 +00:00
r.Guid = u
r.Description = content
2023-07-09 15:02:46 +00:00
if _, ok := hadSend.Load(r.Title); !ok {
go downAndSendMail(dom, r.Title)
}
2023-04-13 18:03:56 +00:00
return
}
2023-07-09 15:02:46 +00:00
// hadSend holds the lesson titles for which a mail has already gone out:
// fetch looks a title up here before starting downAndSendMail, and
// downAndSendMail stores the title only after mail.SendMail succeeds.
// NOTE(review): safety.NewMap is presumably safe for concurrent use, which the
// goroutine started in fetch relies on — confirm against the safety package.
var hadSend = safety.NewMap[string, struct{}]()
2023-07-09 14:57:47 +00:00
func downAndSendMail(doc *goquery.Document, title string) {
2023-07-11 06:54:17 +00:00
if err := mail.CheckEnv(); err != nil {
log.Println("err", err)
return
}
2023-07-09 14:57:47 +00:00
type m struct {
tit string
content string
f []string
}
mm := m{}
var err error
mm.tit = title
mm.content, err = doc.Find(".widget-richtext .text").Html()
if err != nil {
return
}
for _, ss := range []string{".bbcle-download-extension-pdf", ".bbcle-download-extension-mp3"} {
uu, ok := doc.Find(ss).Attr("href")
if ok {
response, err := http.Get(uu)
if err != nil {
continue
}
name := filepath.Base(uu)
file, err := os.OpenFile(name, os.O_CREATE|os.O_WRONLY, 0755)
if err != nil {
continue
}
_, err = io.Copy(file, response.Body)
if err != nil {
continue
}
mm.f = append(mm.f, name)
}
}
if len(mm.f) < 1 {
return
}
2023-07-11 04:12:03 +00:00
err = mail.SendMail(mm.tit, mm.content, mm.f...)
if err != nil {
log.Println("err", err)
return
}
2023-07-09 15:02:46 +00:00
hadSend.Store(mm.tit, struct{}{})
2023-07-09 14:57:47 +00:00
for _, s := range mm.f {
err := os.Remove(s)
if err != nil {
log.Printf("delete file %s err:%v\n", s, err)
}
}
}
2023-04-13 18:03:56 +00:00
// full builds the RSS item for the link found under
// "#bbcle-content .widget-container-full a".
//
// The zero-value item is returned when the matched selection has no href
// attribute or when dateFilter rejects the link for the given recentDay;
// otherwise the result of fetch for that link is returned.
func full(doc *goquery.Document, recentDay int) (r rss2.Item) {
	link, found := doc.Find("#bbcle-content .widget-container-full a").Attr("href")
	if found && dateFilter(link, recentDay) {
		r = fetch(link)
	}
	return r
}
// items builds one RSS item per anchor found in the two-column image widgets
// of the full-width container.
//
// Anchors without an href attribute, and links that dateFilter rejects for the
// given recentDay, are skipped; every remaining link is passed to fetch and
// the results are returned in document order.
func items(doc *goquery.Document, recentDay int) (r []rss2.Item) {
	const selector = "#bbcle-content > div > div.widget-container.widget-container-full > div.widget.widget-image.widget-image-two_column > div a"

	collect := func(_ int, anchor *goquery.Selection) {
		if link, found := anchor.Attr("href"); found && dateFilter(link, recentDay) {
			r = append(r, fetch(link))
		}
	}
	doc.Find(selector).Each(collect)
	return r
}