newfetch/newsource/model.go

174 lines
7.3 KiB
Go
Raw Normal View History

2022-07-23 18:01:07 +00:00
package newsource
import (
"encoding/json"
"fmt"
"github.com/PuerkitoBio/goquery"
"github/fthvgb1/newsfetch/data"
"io/ioutil"
"log"
"net/http"
"regexp"
"strconv"
"strings"
"time"
"unicode/utf8"
)
// Source describes one news site that can be searched: where to send the
// request, how to build it, and how to turn the response into data.FetchData
// records.
//
// NOTE(review): the code that consumes these fields is not in this file; the
// per-field notes below describe how GetSource populates them and hedge
// anything that depends on the consumer.
type Source struct {
// Name is the human-readable name of the site.
Name string
// SearchUrl is the search endpoint; it may already carry fixed query
// parameters.
SearchUrl string
// Method is the HTTP method; GetSource writes it in lower case ("get" or
// "post").
Method string
// Type is the kind of response; GetSource uses "html" and "ajax".
Type string
// KeywordField is a request parameter name — presumably the one that
// carries the search keyword; verify against the caller.
KeywordField string
// ListQuery is a selector string (for example ".center_box0 li") —
// presumably matched against the HTML response to find result items. It is
// empty for the "ajax" source.
ListQuery string
// QueryHandler fills fetchData from one selection. i is presumably the
// index of the selection within the matched list — TODO confirm.
QueryHandler func(i int, selection *goquery.Selection, fetchData *data.FetchData)
// AjaxHandler is not set by any source returned from GetSource.
AjaxHandler func(fetchData *data.FetchData)
// Header is a static header map; no source returned from GetSource sets it.
Header map[string]string
// HeaderFun receives the outgoing request so the source can add its own
// headers to it.
HeaderFun func(r *http.Request)
// ExternParam holds additional fixed request parameters (name -> value).
ExternParam map[string]string
// AjaxDealFun reads the HTTP response and appends the records it parses to
// the given slice.
AjaxDealFun func(*[]data.FetchData, *http.Response)
}
// GetSource builds the list of news sources known to this package.
//
// The current time is captured once per call: relative dates found in search
// results ("3小时前", "昨天", ...) are resolved against it, and it is also used as
// the CreatedTime of records produced by the ajax source. Callers that keep
// the returned slice for a long time will therefore resolve relative dates
// against a stale clock.
func GetSource() []Source {
	now := time.Now()
	digits := regexp.MustCompile(`(\d+)`)
	createdAt := now.Format("2006-01-02 15:04:05")

	// minuteLayout is the format used for dates derived from relative phrases.
	const minuteLayout = "2006-01-02 15:04"
	// descMaxRunes caps the length of descriptions built from article bodies.
	const descMaxRunes = 30

	// resolveRelativeDate converts relative phrases such as "3小时前", "15分钟前",
	// "昨天" or "前天" into an absolute timestamp based on now. Any other text
	// is returned unchanged.
	resolveRelativeDate := func(raw string) string {
		amount, hasAmount := 0, false
		if num := digits.FindString(raw); num != "" {
			// Atoi can only fail here on overflow; in that case the number
			// is unusable and the hour/minute branches are skipped.
			if v, err := strconv.Atoi(num); err == nil {
				amount, hasAmount = v, true
			}
		}
		switch {
		case hasAmount && strings.Contains(raw, "小时"):
			return now.Add(-time.Duration(amount) * time.Hour).Format(minuteLayout)
		case hasAmount && strings.Contains(raw, "分钟"):
			return now.Add(-time.Duration(amount) * time.Minute).Format(minuteLayout)
		case strings.Contains(raw, "昨天"):
			return now.Add(-24 * time.Hour).Format(minuteLayout)
		case strings.Contains(raw, "前天"):
			return now.Add(-48 * time.Hour).Format(minuteLayout)
		}
		return raw
	}

	return []Source{
		{
			Name:      "中央纪委监察部",
			SearchUrl: "https://www.ccdi.gov.cn/was5/web/search",
			Method:    "post",
			ListQuery: ".center_box0 li",
			QueryHandler: func(i int, selection *goquery.Selection, item *data.FetchData) {
				// A missing href simply leaves Url empty, so the "exists"
				// flag is intentionally ignored.
				item.Url, _ = selection.Find("a").First().Attr("href")
				item.Title = selection.Find("b.title").First().Text()
				item.Date = selection.Find("span.time").First().Text()
			},
			Type:         "html",
			KeywordField: "searchword",
			ExternParam: map[string]string{
				"channelid": "298814",
				"orderby":   "RELEVANCE",
			},
			HeaderFun: func(req *http.Request) {
				req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
				req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
				req.Header.Add("Cache-Control", "max-age=0")
				req.Header.Add("Connection", "keep-alive")
				req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
				req.Header.Add("Origin", "https://www.ccdi.gov.cn")
				req.Header.Add("Referer", "https://www.ccdi.gov.cn/")
				req.Header.Add("Sec-Fetch-Dest", "document")
				req.Header.Add("Sec-Fetch-Mode", "navigate")
				req.Header.Add("Sec-Fetch-Site", "same-origin")
				req.Header.Add("Sec-Fetch-User", "?1")
				req.Header.Add("Upgrade-Insecure-Requests", "1")
				req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
				req.Header.Add("sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"")
				req.Header.Add("sec-ch-ua-mobile", "?0")
				req.Header.Add("sec-ch-ua-platform", "\"Windows\"")
			},
		},
		{
			Name:      "百度新闻",
			SearchUrl: "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&rsv_dl=ns_pc",
			Method:    "get",
			ListQuery: "div[class=\"result-op c-container xpath-log new-pmd\"]",
			QueryHandler: func(i int, selection *goquery.Selection, item *data.FetchData) {
				// The result URL is read from the "mu" attribute of the
				// result container; a missing attribute leaves Url empty.
				item.Url, _ = selection.Attr("mu")
				item.Title = selection.Find(".news-title-font_1xS-F").First().Text()
				item.Desc = selection.Find(".c-row .c-color-text").First().Text()
				published := selection.Find("span[class=\"c-color-gray2 c-font-normal c-gap-right-xsmall\"]").First().Text()
				item.Date = resolveRelativeDate(published)
			},
			Type:         "html",
			KeywordField: "word",
			HeaderFun: func(req *http.Request) {
				req.Header.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
				req.Header.Add("accept-language", "zh-CN,zh;q=0.9")
				req.Header.Add("cache-control", "no-cache")
				req.Header.Add("connection", "keep-alive")
				req.Header.Add("cookie", "BIDUPSID=844E3DCAA2EEBF5C872DC99B967B6B7B; PSTM=1655872163; BAIDUID=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; BD_UPN=123353; ORIGIN=2; ISSW=1; ISSW=1; BAIDUID_BFESS=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; ZFY=jWFAySgO:AoQfb6emY9vnmEdptVao:Anj0FFkp028wFws:C; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=42_0_2_2_3_0_1_0_2_0_0_0_18_0_51_0_1655888428_0_1655888377%7C3%230_0_1655888377%7C1; BAIDU_WISE_UID=wapp_1655902298617_702; ZD_ENTRY=google; channel=baidusearch; baikeVisitId=b3b23509-9330-4d33-82ae-b8eb37895917; BA_HECTOR=8k2g2g218ga40181ak1hbgg1n14; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDSVRTM=1011; H_PS_PSSID=36550_36459_36673_36455_36453_36692_36165_36695_36697_36569_36075_36467_36316_36651")
				req.Header.Add("referer", "http://news.baidu.com/")
				req.Header.Add("sec-fetch-dest", "document")
				req.Header.Add("sec-fetch-mode", "navigate")
				req.Header.Add("sec-fetch-site", "cross-site")
				req.Header.Add("sec-fetch-user", "?1")
				req.Header.Add("upgrade-insecure-requests", "1")
				req.Header.Add("user-agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
				// NOTE(review): the leading '#' makes this a different header
				// name from "sec-ch-ua"; it looks like a disabled entry copied
				// from an HTTP client export. Kept as-is to avoid changing the
				// request — confirm whether it should be dropped or renamed.
				req.Header.Add("#sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"")
				req.Header.Add("sec-ch-ua-mobile", "?0")
				req.Header.Add("sec-ch-ua-platform", "\"Windows\"")
				req.Header.Add("postman-token", "81407fbc-2b96-54a7-0193-f640156714ab")
			},
		},
		{
			Name:         "中国纪检监察报",
			SearchUrl:    "https://jjjcb.ccdi.gov.cn/reader/layout/getSearch.do?beginDocPubTime=&searchField=3&author=&mc=%E4%B8%AD%E5%9B%BD%E7%BA%AA%E6%A3%80%E7%9B%91%E5%AF%9F%E6%8A%A5&endDocPubTime=&pageNo=1&pageSize=10&flag=0&sort=0&asc=1",
			Method:       "get",
			Type:         "ajax",
			ListQuery:    "",
			KeywordField: "keyword",
			HeaderFun: func(req *http.Request) {
				req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
				req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
				req.Header.Add("Cache-Control", "max-age=0")
				req.Header.Add("Connection", "keep-alive")
				req.Header.Add("Upgrade-Insecure-Requests", "1")
				req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
			},
			AjaxDealFun: func(fetchData *[]data.FetchData, response *http.Response) {
				// NOTE(review): the body is read but not closed here;
				// presumably the caller owns and closes it — verify.
				body, err := ioutil.ReadAll(response.Body)
				if err != nil {
					log.Println(err)
					return
				}
				var page CCDIPAGERResponse
				if err = json.Unmarshal(body, &page); err != nil {
					log.Println(err)
					return
				}
				for _, entry := range page.Data.Info {
					// Entries without a title (YT) are skipped.
					if entry.YT == "" {
						continue
					}
					// Keep at most descMaxRunes runes of the tag-stripped
					// content. Shorter text is used as-is; slicing it to a
					// fixed length would panic.
					desc := stripTags(entry.IRCONTENT)
					if utf8.RuneCountInString(desc) > descMaxRunes {
						desc = string([]rune(desc)[:descMaxRunes])
					}
					*fetchData = append(*fetchData, data.FetchData{
						Url:         fmt.Sprintf("https://jjjcb.ccdi.gov.cn/epaper/index.html?guid=%s", entry.ZBGUID),
						Title:       entry.YT,
						Desc:        desc,
						Date:        entry.DOCPUBTIME,
						CreatedTime: createdAt,
						Source:      "中国纪检监察报",
					})
				}
			},
		},
	}
}