package main import ( "github.com/PuerkitoBio/goquery" "github.com/gin-gonic/gin" "github.com/gorilla/websocket" "log" "net/http" "time" ) type fetchData struct { url string title string desc string date string } type fetchHandler struct { fetchUrl string hadFetchData []fetchData cronTime time.Duration keyword string hadFetchedMap map[string]int reloadCron chan int isOff chan int ws *websocket.Conn rMsgChan chan message newFetchItem chan []fetchData } type message struct { Status bool Action string Message string Data interface{} } func newFetchHandler(fetchUrl, keyword string) *fetchHandler { return &fetchHandler{ fetchUrl: fetchUrl, keyword: keyword, hadFetchedMap: make(map[string]int), cronTime: 60 * time.Second, reloadCron: make(chan int), isOff: make(chan int), rMsgChan: make(chan message, 10), newFetchItem: make(chan []fetchData, 10), } } func (f *fetchHandler) handle() { f.parsesDom(f.fetch(f.fetchUrl + f.keyword)) } func (f *fetchHandler) receiveMsg() { for { r := <-f.rMsgChan switch r.Action { case "search": f.handle() case "timeStepSet": if t, ok := r.Data.(int); ok { f.reloadCron <- t } } } } func (f *fetchHandler) fetch(url string) *http.Response { client := http.Client{ Transport: nil, CheckRedirect: nil, Jar: nil, Timeout: 10 * time.Second, } req, _ := http.NewRequest("GET", url, nil) req.Header.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") req.Header.Add("accept-language", "zh-CN,zh;q=0.9") req.Header.Add("cache-control", "no-cache") req.Header.Add("connection", "keep-alive") req.Header.Add("cookie", "BIDUPSID=844E3DCAA2EEBF5C872DC99B967B6B7B; PSTM=1655872163; BAIDUID=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; BD_UPN=123353; ORIGIN=2; ISSW=1; ISSW=1; BAIDUID_BFESS=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; ZFY=jWFAySgO:AoQfb6emY9vnmEdptVao:Anj0FFkp028wFws:C; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=42_0_2_2_3_0_1_0_2_0_0_0_18_0_51_0_1655888428_0_1655888377%7C3%230_0_1655888377%7C1; BAIDU_WISE_UID=wapp_1655902298617_702; ZD_ENTRY=google; channel=baidusearch; baikeVisitId=b3b23509-9330-4d33-82ae-b8eb37895917; BA_HECTOR=8k2g2g218ga40181ak1hbgg1n14; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDSVRTM=1011; H_PS_PSSID=36550_36459_36673_36455_36453_36692_36165_36695_36697_36569_36075_36467_36316_36651") req.Header.Add("referer", "http://news.baidu.com/") req.Header.Add("sec-fetch-dest", "document") req.Header.Add("sec-fetch-mode", "navigate") req.Header.Add("sec-fetch-site", "cross-site") req.Header.Add("sec-fetch-user", "?1") req.Header.Add("upgrade-insecure-requests", "1") req.Header.Add("user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36") req.Header.Add("#sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"") req.Header.Add("sec-ch-ua-mobile", "?0") req.Header.Add("sec-ch-ua-platform", "\"Linux\"") req.Header.Add("postman-token", "81407fbc-2b96-54a7-0193-f640156714ab") response, err := client.Do(req) if err != nil { panic(err) } return response } func (f *fetchHandler) parsesDom(html *http.Response) { doc, err := goquery.NewDocumentFromReader(html.Body) if err != nil { panic(err) } doc.Find("div[class=\"result-op c-container xpath-log new-pmd\"]").Each(func(i int, selection *goquery.Selection) { data := fetchData{} data.url, _ = selection.Attr("mu") t := selection.Find(".news-title-font_1xS-F").First() data.title = t.Text() data.desc = selection.Find(".c-row .c-color-text").First().Text() k := data.url + "_" + data.title var newFetch []fetchData if _, ok := f.hadFetchedMap[k]; !ok { f.hadFetchData = append(f.hadFetchData, data) f.hadFetchedMap[k] = 1 newFetch = append(newFetch, data) } f.newFetchItem <- newFetch }) err = html.Body.Close() if err != nil { panic(err) } } func (f *fetchHandler) sendFetchData() { for { data := <-f.newFetchItem err := f.ws.WriteJSON(message{ Status: true, Action: "newData", Message: "", Data: data, }) if err != nil { log.Println(err) } } } func (f *fetchHandler) cronFetch() { t := time.NewTicker(f.cronTime) defer t.Stop() for { select { case <-t.C: f.handle() case tt := <-f.reloadCron: f.cronTime = time.Duration(tt) * time.Second go f.cronFetch() return case <-f.isOff: return } } } func main() { h := newFetchHandler("https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&rsv_dl=ns_pc&word=", "纪检") router := gin.Default() var upgrader = websocket.Upgrader{} router.LoadHTMLGlob("templates/*") //router.LoadHTMLFiles("templates/template1.html", "templates/template2.html") router.GET("/index", func(c *gin.Context) { c.HTML(http.StatusOK, "index.gohtml", gin.H{ "title": "爬虫", "keyword": h.keyword, "timeStep": h.cronTime.Seconds(), }) }) router.GET("ws", func(c *gin.Context) { conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) if err != nil { c.JSON(201, message{ Status: false, Message: err.Error(), Data: nil, Action: "upgradeWs", }) return } h.ws = conn msg := message{} for { err := conn.ReadJSON(msg) if err != nil { return } h.rMsgChan <- msg } }) router.Run(":8080") }