diff --git a/data/fetchdata.go b/data/fetchdata.go new file mode 100644 index 0000000..813b966 --- /dev/null +++ b/data/fetchdata.go @@ -0,0 +1,10 @@ +package data + +type FetchData struct { + Url string `json:"url"` + Title string `json:"title"` + Desc string `json:"desc"` + Date string `json:"date"` + CreatedTime string `json:"created_time"` + Source string `json:"source"` +} diff --git a/main.go b/main.go index 4e80b32..6f4ff23 100644 --- a/main.go +++ b/main.go @@ -3,43 +3,38 @@ package main import ( "embed" "errors" + "fmt" "github.com/PuerkitoBio/goquery" "github.com/gin-gonic/gin" "github.com/gorilla/websocket" + "github/fthvgb1/newsfetch/data" + "github/fthvgb1/newsfetch/newsource" "io/fs" "io/ioutil" "log" "net/http" + "net/http/cookiejar" + "net/url" "os/exec" "path/filepath" - "regexp" "runtime" - "strconv" "strings" "sync" "time" ) -type fetchData struct { - Url string `json:"url"` - Title string `json:"title"` - Desc string `json:"desc"` - Date string `json:"date"` - CreatedTime string `json:"created_time"` -} - type connChan struct { conn string msg message } type dataChan struct { conn string - item []fetchData + item []data.FetchData } type fetchHandler struct { fetchUrl string - hadFetchData []fetchData + hadFetchData []data.FetchData cronTime mapXS[time.Duration] keyword mapXS[string] hadFetchedMap mapXS[int] @@ -144,7 +139,14 @@ func (f *fetchHandler) handle(conn string) { if kk, ok := (*f.keyword.mapX)[conn]; ok && kk != "" { key = kk } - f.parsesDom(f.fetch(f.fetchUrl+key), conn) + for _, source := range newsource.GetSource() { + r := f.fetch2(source, key) + if strings.ToUpper(source.Type) == "HTML" { + f.parsesDom(r, conn, source) + } else { + f.parseAjax(r, source, conn) + } + } } func (f *fetchHandler) receiveMsg() { @@ -161,6 +163,68 @@ func (f *fetchHandler) receiveMsg() { } } +func (f *fetchHandler) fetch2(source newsource.Source, key string) *http.Response { + jar, _ := cookiejar.New(nil) + client := http.Client{ + Transport: nil, + 
CheckRedirect: nil, + Jar: jar, + Timeout: 10 * time.Second, + } + searchUrl := source.SearchUrl + source.Method = strings.ToUpper(source.Method) + if source.Method == "GET" { + if !strings.Contains(searchUrl, "?") { + searchUrl += "?" + source.KeywordField + "=" + url.QueryEscape(key) + } else { + searchUrl += "&" + source.KeywordField + "=" + url.QueryEscape(key) + } + } + var req *http.Request + if source.Method == "POST" { + body := source.KeywordField + "=" + key + if nil != source.ExternParam { + body += "&" + for s, s2 := range source.ExternParam { + body += s + "=" + s2 + "&" + } + body = strings.TrimRight(body, "&") + } + req, _ = http.NewRequest(source.Method, searchUrl, strings.NewReader(body)) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + } else { + req, _ = http.NewRequest(source.Method, searchUrl, nil) + } + if source.Header != nil { + for s, s2 := range source.Header { + req.Header.Set(s, s2) + } + } + if source.HeaderFun != nil { + source.HeaderFun(req) + } + client.CheckRedirect = func(req *http.Request, via []*http.Request) error { + if len(via) > 0 && via[0].URL.Scheme == "https" && req.URL.Scheme != "https" { + lastHop := via[len(via)-1].URL + return fmt.Errorf("redirected from secure URL %s to insecure URL %s", lastHop, req.URL) + } + + // Go's http.DefaultClient allows 10 redirects before returning an error. + // The securityPreservingHTTPClient also uses this default policy to avoid + // Go command hangs. 
+		if len(via) >= 3 {
+			return errors.New("stopped after 3 redirects")
+		}
+		return nil
+	}
+	response, err := client.Do(req)
+
+	if err != nil {
+		panic(err)
+	}
+	return response
+}
+
 func (f *fetchHandler) fetch(url string) *http.Response {
 	defer func() {
 		if r := recover(); r != nil {
@@ -200,7 +264,37 @@ func (f *fetchHandler) fetch(url string) *http.Response {
 	return response
 }
 
-func (f *fetchHandler) parsesDom(html *http.Response, conn string) {
+func (f *fetchHandler) parseAjax(response *http.Response, source newsource.Source, conn string) {
+	defer func() {
+		if r := recover(); r != nil {
+			log.Println(r)
+		}
+	}()
+	var newFetch []data.FetchData
+	source.AjaxDealFun(&newFetch, response)
+	if len(newFetch) > 0 {
+		kept := newFetch[:0]
+		// filter in place: deleting inside a range loop skips/misindexes elements
+		for _, fetchData := range newFetch {
+			k := conn + "_" + fetchData.Url + "_" + fetchData.Title
+			if _, ok := (*f.hadFetchedMap.mapX)[k]; !ok {
+				f.hadFetchData = append(f.hadFetchData, fetchData)
+				setMap(&f.hadFetchedMap, k, 1)
+				kept = append(kept, fetchData)
+			}
+		}
+		f.newFetchItem <- dataChan{
+			conn: conn,
+			item: kept,
+		}
+	}
+	err := response.Body.Close()
+	if err != nil {
+		panic(err)
+	}
+}
+
+func (f *fetchHandler) parsesDom(html *http.Response, conn string, source newsource.Source) {
 	defer func() {
 		if r := recover(); r != nil {
 			log.Println(r)
@@ -210,34 +304,18 @@ func (f *fetchHandler) parsesDom(html *http.Response, conn string) {
 	if err != nil {
 		panic(err)
 	}
-	var newFetch []fetchData
-	ti := time.Now()
-	compile := regexp.MustCompile(`(\d+)`)
-
-	doc.Find("div[class=\"result-op c-container xpath-log new-pmd\"]").Each(func(i int, selection *goquery.Selection) {
-		data := fetchData{}
-		data.Url, _ = selection.Attr("mu")
-		t := selection.Find(".news-title-font_1xS-F").First()
-		data.Title = t.Text()
-		data.CreatedTime = ti.Format("2006-01-02 15:04:05")
-		data.Desc = selection.Find(".c-row .c-color-text").First().Text()
-		data.Date = selection.Find("span[class=\"c-color-gray2
c-font-normal c-gap-right-xsmall\"]").First().Text() - n := compile.FindAllStringSubmatch(data.Date, -1) - if nil != n { - nn, _ := strconv.Atoi(n[0][0]) - if strings.Contains(data.Date, "小时") { - data.Date = ti.Add(-time.Duration(nn) * time.Hour).Format("2006-01-02 15:04") - } - if strings.Contains(data.Date, "分钟") { - data.Date = ti.Add(-time.Duration(nn) * time.Minute).Format("2006-01-02 15:04") - } + var newFetch []data.FetchData + nowDate := time.Now().Format("2006-01-02 15:04:05") + doc.Find(source.ListQuery).Each(func(i int, selection *goquery.Selection) { + fetchData := data.FetchData{ + CreatedTime: nowDate, } - - k := conn + "_" + data.Url + "_" + data.Title + source.QueryHandler(i, selection, &fetchData) + k := conn + "_" + fetchData.Url + "_" + fetchData.Title if _, ok := (*f.hadFetchedMap.mapX)[k]; !ok { - f.hadFetchData = append(f.hadFetchData, data) + f.hadFetchData = append(f.hadFetchData, fetchData) setMap(&f.hadFetchedMap, k, 1) - newFetch = append(newFetch, data) + newFetch = append(newFetch, fetchData) } }) if len(newFetch) > 0 { @@ -254,13 +332,13 @@ func (f *fetchHandler) parsesDom(html *http.Response, conn string) { func (f *fetchHandler) sendFetchData() { for { - data := <-f.newFetchItem + dataFetch := <-f.newFetchItem - err := (*f.connMap.mapX)[data.conn].WriteJSON(message{ + err := (*f.connMap.mapX)[dataFetch.conn].WriteJSON(message{ Status: true, Action: "newData", Message: "", - Data: data.item, + Data: dataFetch.item, }) if err != nil { log.Println(err) diff --git a/newsource/ccdipager.go b/newsource/ccdipager.go new file mode 100644 index 0000000..587e8ff --- /dev/null +++ b/newsource/ccdipager.go @@ -0,0 +1,35 @@ +package newsource + +import ( + "regexp" + "strings" +) + +type CCDIPAGERInfo struct { + BM string `json:"BM"` + BC string `json:"BC"` + DOCAUTHOR string `json:"DOCAUTHOR"` + DOCPUBTIME string `json:"DOCPUBTIME"` + DOCTITLE string `json:"DOCTITLE"` + TXS string `json:"TXS"` + ZBGUID string `json:"ZB_GUID"` + ZBSOURCESITE 
string `json:"ZB_SOURCE_SITE"` + YT string `json:"YT,omitempty"` + IRCONTENT string `json:"IR_CONTENT"` + FB string `json:"FB,omitempty"` +} +type CCDIPAGERResponse struct { + Data CCDIPAGERData `json:"data"` + Code bool `json:"code"` + Msg string `json:"msg"` +} +type CCDIPAGERData struct { + Info []CCDIPAGERInfo `json:"info"` + NextPage int `json:"nextPage"` +} + +func stripTags(content string) string { + content = strings.Replace(content, " ", "", -1) + re := regexp.MustCompile(`<(.|\n)*?>`) + return re.ReplaceAllString(content, "") +} diff --git a/newsource/model.go b/newsource/model.go new file mode 100644 index 0000000..16cff21 --- /dev/null +++ b/newsource/model.go @@ -0,0 +1,173 @@ +package newsource + +import ( + "encoding/json" + "fmt" + "github.com/PuerkitoBio/goquery" + "github/fthvgb1/newsfetch/data" + "io/ioutil" + "log" + "net/http" + "regexp" + "strconv" + "strings" + "time" + "unicode/utf8" +) + +type Source struct { + Name string + SearchUrl string + Method string + Type string + KeywordField string + ListQuery string + QueryHandler func(i int, selection *goquery.Selection, fetchData *data.FetchData) + AjaxHandler func(fetchData *data.FetchData) + Header map[string]string + HeaderFun func(r *http.Request) + ExternParam map[string]string + AjaxDealFun func(*[]data.FetchData, *http.Response) +} + +func GetSource() []Source { + ti := time.Now() + compile := regexp.MustCompile(`(\d+)`) + nowDate := ti.Format("2006-01-02 15:04:05") + return []Source{ + { + Name: "中央纪委监察部", + SearchUrl: "https://www.ccdi.gov.cn/was5/web/search", + Method: "post", + ListQuery: ".center_box0 li", + QueryHandler: func(i int, selection *goquery.Selection, data *data.FetchData) { + data.Url, _ = selection.Find("a").First().Attr("href") + data.Title = selection.Find("b.title").First().Text() + data.Date = selection.Find("span.time").First().Text() + }, + Type: "html", + KeywordField: "searchword", + ExternParam: map[string]string{ + "channelid": "298814", + "orderby": 
"RELEVANCE", + }, + HeaderFun: func(req *http.Request) { + req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") + req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9") + req.Header.Add("Cache-Control", "max-age=0") + req.Header.Add("Connection", "keep-alive") + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + req.Header.Add("Origin", "https://www.ccdi.gov.cn") + req.Header.Add("Referer", "https://www.ccdi.gov.cn/") + req.Header.Add("Sec-Fetch-Dest", "document") + req.Header.Add("Sec-Fetch-Mode", "navigate") + req.Header.Add("Sec-Fetch-Site", "same-origin") + req.Header.Add("Sec-Fetch-User", "?1") + req.Header.Add("Upgrade-Insecure-Requests", "1") + req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36") + req.Header.Add("sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"") + req.Header.Add("sec-ch-ua-mobile", "?0") + req.Header.Add("sec-ch-ua-platform", "\"Windows\"") + }, + }, + { + Name: "百度新闻", + SearchUrl: "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&rsv_dl=ns_pc", + Method: "get", + ListQuery: "div[class=\"result-op c-container xpath-log new-pmd\"]", + QueryHandler: func(i int, selection *goquery.Selection, data *data.FetchData) { + data.Url, _ = selection.Attr("mu") + t := selection.Find(".news-title-font_1xS-F").First() + data.Title = t.Text() + data.Desc = selection.Find(".c-row .c-color-text").First().Text() + data.Date = selection.Find("span[class=\"c-color-gray2 c-font-normal c-gap-right-xsmall\"]").First().Text() + n := compile.FindAllStringSubmatch(data.Date, -1) + if nil != n { + nn, _ := strconv.Atoi(n[0][0]) + if strings.Contains(data.Date, "小时") { + data.Date = ti.Add(-time.Duration(nn) * time.Hour).Format("2006-01-02 15:04") + } + if strings.Contains(data.Date, "分钟") { + data.Date = 
ti.Add(-time.Duration(nn) * time.Minute).Format("2006-01-02 15:04") + } + } + if strings.Contains(data.Date, "昨天") { + data.Date = ti.Add(-time.Duration(24) * time.Hour).Format("2006-01-02 15:04") + } + if strings.Contains(data.Date, "前天") { + data.Date = ti.Add(-time.Duration(48) * time.Hour).Format("2006-01-02 15:04") + } + }, + Type: "html", + KeywordField: "word", + HeaderFun: func(req *http.Request) { + req.Header.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") + req.Header.Add("accept-language", "zh-CN,zh;q=0.9") + req.Header.Add("cache-control", "no-cache") + req.Header.Add("connection", "keep-alive") + req.Header.Add("cookie", "BIDUPSID=844E3DCAA2EEBF5C872DC99B967B6B7B; PSTM=1655872163; BAIDUID=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; BD_UPN=123353; ORIGIN=2; ISSW=1; ISSW=1; BAIDUID_BFESS=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; ZFY=jWFAySgO:AoQfb6emY9vnmEdptVao:Anj0FFkp028wFws:C; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=42_0_2_2_3_0_1_0_2_0_0_0_18_0_51_0_1655888428_0_1655888377%7C3%230_0_1655888377%7C1; BAIDU_WISE_UID=wapp_1655902298617_702; ZD_ENTRY=google; channel=baidusearch; baikeVisitId=b3b23509-9330-4d33-82ae-b8eb37895917; BA_HECTOR=8k2g2g218ga40181ak1hbgg1n14; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDSVRTM=1011; H_PS_PSSID=36550_36459_36673_36455_36453_36692_36165_36695_36697_36569_36075_36467_36316_36651") + req.Header.Add("referer", "http://news.baidu.com/") + req.Header.Add("sec-fetch-dest", "document") + req.Header.Add("sec-fetch-mode", "navigate") + req.Header.Add("sec-fetch-site", "cross-site") + req.Header.Add("sec-fetch-user", "?1") + req.Header.Add("upgrade-insecure-requests", "1") + req.Header.Add("user-agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36") + req.Header.Add("#sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", 
\"Chromium\";v=\"103\"")
+				req.Header.Add("sec-ch-ua-mobile", "?0")
+				req.Header.Add("sec-ch-ua-platform", "\"Windows\"")
+				req.Header.Add("postman-token", "81407fbc-2b96-54a7-0193-f640156714ab")
+
+			},
+		},
+		{
+			Name:         "中国纪检监察报",
+			SearchUrl:    "https://jjjcb.ccdi.gov.cn/reader/layout/getSearch.do?beginDocPubTime=&searchField=3&author=&mc=%E4%B8%AD%E5%9B%BD%E7%BA%AA%E6%A3%80%E7%9B%91%E5%AF%9F%E6%8A%A5&endDocPubTime=&pageNo=1&pageSize=10&flag=0&sort=0&asc=1",
+			Method:       "get",
+			Type:         "ajax",
+			ListQuery:    "",
+			KeywordField: "keyword",
+			HeaderFun: func(req *http.Request) {
+				req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
+				req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
+				req.Header.Add("Cache-Control", "max-age=0")
+				req.Header.Add("Connection", "keep-alive")
+				req.Header.Add("Upgrade-Insecure-Requests", "1")
+				req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
+			},
+			AjaxDealFun: func(fetchData *[]data.FetchData, response *http.Response) {
+				bytes, err := ioutil.ReadAll(response.Body)
+				if err != nil {
+					log.Println(err)
+					return
+				}
+				var r CCDIPAGERResponse
+				err = json.Unmarshal(bytes, &r)
+				if err != nil {
+					log.Println(err)
+					return
+				}
+				for _, v := range r.Data.Info {
+					if "" == v.YT {
+						continue
+					}
+					desc := stripTags(v.IRCONTENT)
+					l := utf8.RuneCountInString(desc)
+					if l > 30 {
+						l = 30
+					}
+					desc = string([]rune(desc)[:l])
+					d := data.FetchData{
+						Url:         fmt.Sprintf("https://jjjcb.ccdi.gov.cn/epaper/index.html?guid=%s", v.ZBGUID),
+						Title:       v.YT,
+						Desc:        desc,
+						Date:        v.DOCPUBTIME,
+						CreatedTime: nowDate,
+						Source:      "中国纪检监察报",
+					}
+					*fetchData = append(*fetchData, d)
+				}
+			},
+		},
+	}
+}