newfetch/main.go

325 lines
8.2 KiB
Go
Raw Normal View History

2022-07-01 05:45:49 +00:00
package main
import (
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gin-gonic/gin"
	"github.com/gorilla/websocket"
)
// fetchData is one scraped search result in the shape that is serialized to
// websocket clients.
type fetchData struct {
	// Url is the value of the result element's "mu" attribute.
	Url string `json:"url"`
	// Title is the headline text of the result.
	Title string `json:"title"`
	// Desc is the summary text of the result.
	Desc string `json:"desc"`
	// Date is the date label shown for the result; relative hour/minute
	// labels are rewritten to "2006-01-02 15:04" by parsesDom.
	Date string `json:"date"`
	// CreatedTime is when the item was scraped ("2006-01-02 15:04:05").
	CreatedTime string `json:"created_time"`
}
2022-07-05 03:54:09 +00:00
// connChan carries one decoded client message together with the key of the
// connection it was read from.
type connChan struct {
	conn string  // connection key (the remote address string used in main)
	msg  message // message decoded from the websocket
}
// dataChan is a batch of newly scraped items addressed to one connection.
type dataChan struct {
	conn string      // connection key the batch is destined for
	item []fetchData // items not previously delivered to that connection
}
2022-07-01 05:45:49 +00:00
type fetchHandler struct {
fetchUrl string
hadFetchData []fetchData
2022-07-06 09:03:42 +00:00
cronTime mapXS[time.Duration]
keyword mapXS[string]
hadFetchedMap mapXS[int]
reloadCron mapXS[chan int]
2022-07-01 05:45:49 +00:00
isOff chan int
2022-07-05 03:54:09 +00:00
rMsgChan chan connChan
newFetchItem chan dataChan
2022-07-06 09:03:42 +00:00
connMap mapXS[*websocket.Conn]
2022-07-05 03:54:09 +00:00
}
// setting is the payload of a "search" message sent by a client.
type setting struct {
	// Keyword is the search term to scrape for.
	Keyword string `json:"keyword"`
	// TimeStep is the polling interval in seconds.
	TimeStep int `json:"timeStep"`
}
// message is the JSON envelope exchanged over the websocket in both
// directions. The fields carry no tags, so they are encoded under their Go
// names.
type message struct {
	Status  bool   // true for successful pushes
	Action  string // message kind, e.g. "search" or "newData"
	Message string // optional human-readable text
	Data    any    // action-specific payload
}
2022-07-06 09:03:42 +00:00
func setMap[T mapT](obj *mapXS[T], key string, v T) {
2022-07-06 02:55:28 +00:00
obj.Lock()
(*obj.mapX)[key] = v
obj.Unlock()
}
2022-07-06 09:03:42 +00:00
// delMap removes key from obj's map while holding obj's mutex. Removing a
// key that is not present is a no-op.
func delMap[T mapT](obj *mapXS[T], key string) {
	obj.Lock()
	defer obj.Unlock()

	entries := *obj.mapX
	delete(entries, key)
}
// mapT is the type-set constraint listing every value type that mapX and
// mapXS are instantiated with.
type mapT interface {
	string |
		int |
		time.Duration |
		*websocket.Conn |
		chan int
}
// mapX is a string-keyed map whose value type is restricted by mapT.
type mapX[T mapT] map[string]T
2022-07-06 02:55:28 +00:00
2022-07-06 09:03:42 +00:00
type mapXS[T mapT] struct {
2022-07-06 02:55:28 +00:00
*mapX[T]
*sync.Mutex
}
2022-07-05 03:54:09 +00:00
func newFetchHandler(fetchUrl string) *fetchHandler {
2022-07-01 05:45:49 +00:00
return &fetchHandler{
2022-07-06 09:03:42 +00:00
fetchUrl: fetchUrl,
keyword: mapXS[string]{
&mapX[string]{},
&sync.Mutex{},
},
hadFetchedMap: mapXS[int]{
&mapX[int]{},
&sync.Mutex{},
},
cronTime: mapXS[time.Duration]{
&mapX[time.Duration]{},
&sync.Mutex{},
},
reloadCron: mapXS[chan int]{
&mapX[chan int]{},
&sync.Mutex{},
},
isOff: make(chan int),
rMsgChan: make(chan connChan, 10),
newFetchItem: make(chan dataChan, 10),
connMap: mapXS[*websocket.Conn]{
&mapX[*websocket.Conn]{},
&sync.Mutex{},
},
2022-07-01 05:45:49 +00:00
}
}
2022-07-05 03:54:09 +00:00
func (f *fetchHandler) handle(conn string) {
key := "纪检"
2022-07-06 09:03:42 +00:00
if kk, ok := (*f.keyword.mapX)[conn]; ok && kk != "" {
2022-07-05 03:54:09 +00:00
key = kk
}
f.parsesDom(f.fetch(f.fetchUrl+key), conn)
2022-07-01 05:45:49 +00:00
}
func (f *fetchHandler) receiveMsg() {
for {
r := <-f.rMsgChan
2022-07-05 03:54:09 +00:00
switch r.msg.Action {
2022-07-01 05:45:49 +00:00
case "search":
2022-07-05 03:54:09 +00:00
if t, ok := r.msg.Data.(*setting); ok {
2022-07-06 09:03:42 +00:00
(*f.reloadCron.mapX)[r.conn] <- t.TimeStep
setMap[string](&f.keyword, r.conn, t.Keyword)
2022-07-05 03:54:09 +00:00
f.handle(r.conn)
2022-07-01 05:45:49 +00:00
}
}
}
}
func (f *fetchHandler) fetch(url string) *http.Response {
2022-07-05 03:54:09 +00:00
defer func() {
if r := recover(); r != nil {
log.Println(r)
}
}()
2022-07-01 05:45:49 +00:00
client := http.Client{
Transport: nil,
CheckRedirect: nil,
Jar: nil,
Timeout: 10 * time.Second,
}
req, _ := http.NewRequest("GET", url, nil)
req.Header.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
req.Header.Add("accept-language", "zh-CN,zh;q=0.9")
req.Header.Add("cache-control", "no-cache")
req.Header.Add("connection", "keep-alive")
req.Header.Add("cookie", "BIDUPSID=844E3DCAA2EEBF5C872DC99B967B6B7B; PSTM=1655872163; BAIDUID=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; BD_UPN=123353; ORIGIN=2; ISSW=1; ISSW=1; BAIDUID_BFESS=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; ZFY=jWFAySgO:AoQfb6emY9vnmEdptVao:Anj0FFkp028wFws:C; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=42_0_2_2_3_0_1_0_2_0_0_0_18_0_51_0_1655888428_0_1655888377%7C3%230_0_1655888377%7C1; BAIDU_WISE_UID=wapp_1655902298617_702; ZD_ENTRY=google; channel=baidusearch; baikeVisitId=b3b23509-9330-4d33-82ae-b8eb37895917; BA_HECTOR=8k2g2g218ga40181ak1hbgg1n14; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDSVRTM=1011; H_PS_PSSID=36550_36459_36673_36455_36453_36692_36165_36695_36697_36569_36075_36467_36316_36651")
req.Header.Add("referer", "http://news.baidu.com/")
req.Header.Add("sec-fetch-dest", "document")
req.Header.Add("sec-fetch-mode", "navigate")
req.Header.Add("sec-fetch-site", "cross-site")
req.Header.Add("sec-fetch-user", "?1")
req.Header.Add("upgrade-insecure-requests", "1")
req.Header.Add("user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
req.Header.Add("#sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"")
req.Header.Add("sec-ch-ua-mobile", "?0")
req.Header.Add("sec-ch-ua-platform", "\"Linux\"")
req.Header.Add("postman-token", "81407fbc-2b96-54a7-0193-f640156714ab")
response, err := client.Do(req)
2022-07-05 03:54:09 +00:00
2022-07-01 05:45:49 +00:00
if err != nil {
panic(err)
}
return response
}
2022-07-05 03:54:09 +00:00
func (f *fetchHandler) parsesDom(html *http.Response, conn string) {
defer func() {
if r := recover(); r != nil {
log.Println(r)
}
}()
2022-07-01 05:45:49 +00:00
doc, err := goquery.NewDocumentFromReader(html.Body)
if err != nil {
panic(err)
}
2022-07-05 03:54:09 +00:00
var newFetch []fetchData
ti := time.Now()
compile := regexp.MustCompile(`(\d+)`)
2022-07-01 05:45:49 +00:00
doc.Find("div[class=\"result-op c-container xpath-log new-pmd\"]").Each(func(i int, selection *goquery.Selection) {
data := fetchData{}
2022-07-05 03:54:09 +00:00
data.Url, _ = selection.Attr("mu")
2022-07-01 05:45:49 +00:00
t := selection.Find(".news-title-font_1xS-F").First()
2022-07-05 03:54:09 +00:00
data.Title = t.Text()
2022-07-06 09:03:42 +00:00
data.CreatedTime = ti.Format("2006-01-02 15:04:05")
2022-07-05 03:54:09 +00:00
data.Desc = selection.Find(".c-row .c-color-text").First().Text()
data.Date = selection.Find("span[class=\"c-color-gray2 c-font-normal c-gap-right-xsmall\"]").First().Text()
n := compile.FindAllStringSubmatch(data.Date, -1)
2022-07-06 02:55:28 +00:00
if nil != n {
nn, _ := strconv.Atoi(n[0][0])
if strings.Contains(data.Date, "小时") {
data.Date = ti.Add(-time.Duration(nn) * time.Hour).Format("2006-01-02 15:04")
}
if strings.Contains(data.Date, "分钟") {
data.Date = ti.Add(-time.Duration(nn) * time.Minute).Format("2006-01-02 15:04")
}
2022-07-05 03:54:09 +00:00
}
2022-07-06 02:55:28 +00:00
2022-07-05 03:54:09 +00:00
k := conn + "_" + data.Url + "_" + data.Title
2022-07-06 09:03:42 +00:00
if _, ok := (*f.hadFetchedMap.mapX)[k]; !ok {
2022-07-01 05:45:49 +00:00
f.hadFetchData = append(f.hadFetchData, data)
2022-07-06 09:03:42 +00:00
setMap(&f.hadFetchedMap, k, 1)
2022-07-01 05:45:49 +00:00
newFetch = append(newFetch, data)
}
})
2022-07-05 03:54:09 +00:00
if len(newFetch) > 0 {
f.newFetchItem <- dataChan{
conn: conn,
item: newFetch,
}
}
2022-07-01 05:45:49 +00:00
err = html.Body.Close()
if err != nil {
panic(err)
}
}
func (f *fetchHandler) sendFetchData() {
for {
data := <-f.newFetchItem
2022-07-05 03:54:09 +00:00
2022-07-06 09:03:42 +00:00
err := (*f.connMap.mapX)[data.conn].WriteJSON(message{
2022-07-01 05:45:49 +00:00
Status: true,
Action: "newData",
Message: "",
2022-07-05 03:54:09 +00:00
Data: data.item,
2022-07-01 05:45:49 +00:00
})
if err != nil {
log.Println(err)
}
}
}
2022-07-05 03:54:09 +00:00
func (f *fetchHandler) cronFetch(conn string, c chan int) {
2022-07-06 09:03:42 +00:00
step, ok := (*f.cronTime.mapX)[conn]
2022-07-05 03:54:09 +00:00
if !ok {
step = time.Second * 60
}
t := time.NewTicker(step)
2022-07-06 09:03:42 +00:00
if _, ok := (*f.cronTime.mapX)[conn]; !ok {
setMap(&f.reloadCron, conn, make(chan int))
2022-07-05 03:54:09 +00:00
}
2022-07-01 05:45:49 +00:00
defer t.Stop()
for {
select {
case <-t.C:
2022-07-05 03:54:09 +00:00
f.handle(conn)
2022-07-06 09:03:42 +00:00
case tt := <-(*f.reloadCron.mapX)[conn]:
setMap(&f.cronTime, conn, time.Duration(tt)*time.Second)
2022-07-05 03:54:09 +00:00
go f.cronFetch(conn, c)
2022-07-01 05:45:49 +00:00
return
2022-07-05 03:54:09 +00:00
case <-c:
close(c)
2022-07-01 05:45:49 +00:00
return
}
}
}
func main() {
2022-07-05 03:54:09 +00:00
h := newFetchHandler("https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&rsv_dl=ns_pc&word=")
2022-07-01 05:45:49 +00:00
router := gin.Default()
2022-07-05 03:54:09 +00:00
var upgrader = websocket.Upgrader{
CheckOrigin: func(r *http.Request) bool {
return true
},
}
go h.sendFetchData()
go h.receiveMsg()
2022-07-01 05:45:49 +00:00
router.LoadHTMLGlob("templates/*")
//router.LoadHTMLFiles("templates/template1.html", "templates/template2.html")
router.GET("/index", func(c *gin.Context) {
c.HTML(http.StatusOK, "index.gohtml", gin.H{
2022-07-05 03:54:09 +00:00
"title": "爬虫",
2022-07-01 05:45:49 +00:00
})
})
router.GET("ws", func(c *gin.Context) {
conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
if err != nil {
c.JSON(201, message{
Status: false,
Message: err.Error(),
Data: nil,
Action: "upgradeWs",
})
2022-07-05 03:54:09 +00:00
log.Println(err)
2022-07-01 05:45:49 +00:00
return
}
2022-07-05 03:54:09 +00:00
remote := conn.RemoteAddr().String()
2022-07-06 09:03:42 +00:00
if _, ok := (*h.connMap.mapX)[remote]; !ok {
setMap(&h.connMap, remote, conn)
2022-07-01 05:45:49 +00:00
}
2022-07-05 03:54:09 +00:00
cc := make(chan int)
go h.cronFetch(remote, cc)
go func() {
msg := connChan{
conn: remote,
msg: message{
Data: &setting{},
},
}
for {
err := conn.ReadJSON(&msg.msg)
if err != nil {
2022-07-06 09:03:42 +00:00
if _, ok := (*h.connMap.mapX)[remote]; ok && !websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway) {
delMap(&h.connMap, remote)
2022-07-05 03:54:09 +00:00
cc <- 1
return
}
log.Println(err)
} else {
h.rMsgChan <- msg
}
}
}()
2022-07-01 05:45:49 +00:00
})
2022-07-05 03:54:09 +00:00
err := router.Run(":8080")
if err != nil {
panic(err)
}
2022-07-01 05:45:49 +00:00
}