198 lines
5.4 KiB
Go
198 lines
5.4 KiB
Go
package main
|
|
|
|
import (
	"encoding/json"
	"log"
	"net/http"
	"net/url"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gin-gonic/gin"
	"github.com/gorilla/websocket"
)
|
|
|
|
// fetchData is one search result scraped from the results page.
type fetchData struct {
	url   string // value of the result container's "mu" attribute
	title string // text of the result's title element
	desc  string // text of the result's summary element
	date  string // not assigned anywhere in the visible code
}

// MarshalJSON encodes the item as a JSON object whose keys mirror the field
// names.
//
// All fields are unexported, so without this method encoding/json would emit
// "{}" for every item placed in message.Data and written to the websocket.
// NOTE(review): the key names are chosen here; confirm them against whatever
// consumes the websocket payload.
func (d fetchData) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		URL   string `json:"url"`
		Title string `json:"title"`
		Desc  string `json:"desc"`
		Date  string `json:"date"`
	}{d.url, d.title, d.desc, d.date})
}
|
|
type fetchHandler struct {
|
|
fetchUrl string
|
|
hadFetchData []fetchData
|
|
cronTime time.Duration
|
|
keyword string
|
|
hadFetchedMap map[string]int
|
|
reloadCron chan int
|
|
isOff chan int
|
|
ws *websocket.Conn
|
|
rMsgChan chan message
|
|
newFetchItem chan []fetchData
|
|
}
|
|
|
|
// message is the envelope exchanged over the websocket connection. It is
// decoded from incoming frames and encoded for outgoing ones, using the Go
// field names as JSON keys because no struct tags are set.
type message struct {
	Status          bool        // true on the "newData" messages, false on the upgrade failure reply
	Action, Message string      // Action selects the operation, e.g. "search", "timeStepSet", "newData"
	Data            interface{} // payload whose shape depends on Action
}
|
|
|
|
func newFetchHandler(fetchUrl, keyword string) *fetchHandler {
|
|
return &fetchHandler{
|
|
fetchUrl: fetchUrl,
|
|
keyword: keyword,
|
|
hadFetchedMap: make(map[string]int),
|
|
cronTime: 60 * time.Second,
|
|
reloadCron: make(chan int),
|
|
isOff: make(chan int),
|
|
rMsgChan: make(chan message, 10),
|
|
newFetchItem: make(chan []fetchData, 10),
|
|
}
|
|
}
|
|
|
|
func (f *fetchHandler) handle() {
|
|
f.parsesDom(f.fetch(f.fetchUrl + f.keyword))
|
|
}
|
|
|
|
func (f *fetchHandler) receiveMsg() {
|
|
for {
|
|
r := <-f.rMsgChan
|
|
switch r.Action {
|
|
case "search":
|
|
f.handle()
|
|
case "timeStepSet":
|
|
if t, ok := r.Data.(int); ok {
|
|
f.reloadCron <- t
|
|
}
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
func (f *fetchHandler) fetch(url string) *http.Response {
|
|
client := http.Client{
|
|
Transport: nil,
|
|
CheckRedirect: nil,
|
|
Jar: nil,
|
|
Timeout: 10 * time.Second,
|
|
}
|
|
req, _ := http.NewRequest("GET", url, nil)
|
|
|
|
req.Header.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
|
req.Header.Add("accept-language", "zh-CN,zh;q=0.9")
|
|
req.Header.Add("cache-control", "no-cache")
|
|
req.Header.Add("connection", "keep-alive")
|
|
req.Header.Add("cookie", "BIDUPSID=844E3DCAA2EEBF5C872DC99B967B6B7B; PSTM=1655872163; BAIDUID=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; BD_UPN=123353; ORIGIN=2; ISSW=1; ISSW=1; BAIDUID_BFESS=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; ZFY=jWFAySgO:AoQfb6emY9vnmEdptVao:Anj0FFkp028wFws:C; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=42_0_2_2_3_0_1_0_2_0_0_0_18_0_51_0_1655888428_0_1655888377%7C3%230_0_1655888377%7C1; BAIDU_WISE_UID=wapp_1655902298617_702; ZD_ENTRY=google; channel=baidusearch; baikeVisitId=b3b23509-9330-4d33-82ae-b8eb37895917; BA_HECTOR=8k2g2g218ga40181ak1hbgg1n14; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDSVRTM=1011; H_PS_PSSID=36550_36459_36673_36455_36453_36692_36165_36695_36697_36569_36075_36467_36316_36651")
|
|
req.Header.Add("referer", "http://news.baidu.com/")
|
|
req.Header.Add("sec-fetch-dest", "document")
|
|
req.Header.Add("sec-fetch-mode", "navigate")
|
|
req.Header.Add("sec-fetch-site", "cross-site")
|
|
req.Header.Add("sec-fetch-user", "?1")
|
|
req.Header.Add("upgrade-insecure-requests", "1")
|
|
req.Header.Add("user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
|
|
req.Header.Add("#sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"")
|
|
req.Header.Add("sec-ch-ua-mobile", "?0")
|
|
req.Header.Add("sec-ch-ua-platform", "\"Linux\"")
|
|
req.Header.Add("postman-token", "81407fbc-2b96-54a7-0193-f640156714ab")
|
|
|
|
response, err := client.Do(req)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return response
|
|
}
|
|
|
|
func (f *fetchHandler) parsesDom(html *http.Response) {
|
|
doc, err := goquery.NewDocumentFromReader(html.Body)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
doc.Find("div[class=\"result-op c-container xpath-log new-pmd\"]").Each(func(i int, selection *goquery.Selection) {
|
|
data := fetchData{}
|
|
data.url, _ = selection.Attr("mu")
|
|
t := selection.Find(".news-title-font_1xS-F").First()
|
|
data.title = t.Text()
|
|
data.desc = selection.Find(".c-row .c-color-text").First().Text()
|
|
k := data.url + "_" + data.title
|
|
var newFetch []fetchData
|
|
if _, ok := f.hadFetchedMap[k]; !ok {
|
|
f.hadFetchData = append(f.hadFetchData, data)
|
|
f.hadFetchedMap[k] = 1
|
|
newFetch = append(newFetch, data)
|
|
}
|
|
f.newFetchItem <- newFetch
|
|
})
|
|
|
|
err = html.Body.Close()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
|
|
func (f *fetchHandler) sendFetchData() {
|
|
for {
|
|
data := <-f.newFetchItem
|
|
err := f.ws.WriteJSON(message{
|
|
Status: true,
|
|
Action: "newData",
|
|
Message: "",
|
|
Data: data,
|
|
})
|
|
if err != nil {
|
|
log.Println(err)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (f *fetchHandler) cronFetch() {
|
|
t := time.NewTicker(f.cronTime)
|
|
defer t.Stop()
|
|
for {
|
|
select {
|
|
case <-t.C:
|
|
f.handle()
|
|
case tt := <-f.reloadCron:
|
|
f.cronTime = time.Duration(tt) * time.Second
|
|
go f.cronFetch()
|
|
return
|
|
case <-f.isOff:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func main() {
|
|
h := newFetchHandler("https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&rsv_dl=ns_pc&word=", "纪检")
|
|
router := gin.Default()
|
|
var upgrader = websocket.Upgrader{}
|
|
router.LoadHTMLGlob("templates/*")
|
|
//router.LoadHTMLFiles("templates/template1.html", "templates/template2.html")
|
|
router.GET("/index", func(c *gin.Context) {
|
|
c.HTML(http.StatusOK, "index.gohtml", gin.H{
|
|
"title": "爬虫",
|
|
"keyword": h.keyword,
|
|
"timeStep": h.cronTime.Seconds(),
|
|
})
|
|
})
|
|
router.GET("ws", func(c *gin.Context) {
|
|
conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
|
|
if err != nil {
|
|
c.JSON(201, message{
|
|
Status: false,
|
|
Message: err.Error(),
|
|
Data: nil,
|
|
Action: "upgradeWs",
|
|
})
|
|
return
|
|
}
|
|
h.ws = conn
|
|
msg := message{}
|
|
for {
|
|
err := conn.ReadJSON(msg)
|
|
if err != nil {
|
|
return
|
|
}
|
|
h.rMsgChan <- msg
|
|
}
|
|
})
|
|
router.Run(":8080")
|
|
}
|