408 lines
16 KiB
Go
408 lines
16 KiB
Go
package newsource
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github/fthvgb1/newsfetch/data"
|
|
"io/ioutil"
|
|
"log"
|
|
"net/http"
|
|
"reflect"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
type Source struct {
|
|
Name string
|
|
SearchUrl string
|
|
Method string
|
|
Type string
|
|
KeywordField string
|
|
ListQuery string
|
|
QueryHandler func(i int, selection *goquery.Selection, fetchData *data.FetchData)
|
|
AjaxHandler func(fetchData *data.FetchData)
|
|
Header map[string]string
|
|
HeaderFun func(r *http.Request)
|
|
ExternParam map[string]string
|
|
AjaxDealFun func(*[]data.FetchData, *http.Response)
|
|
Target reflect.Type
|
|
UnmarshalerFun func([]byte) interface{}
|
|
AjaxSimpleDeal func(interface{}, *[]data.FetchData)
|
|
IsJson bool
|
|
}
|
|
|
|
// stripTagsPattern matches one markup tag: a "<", the shortest possible run of
// any characters (newlines included), and the closing ">". It is compiled once
// at package initialisation instead of on every StripTags call.
var stripTagsPattern = regexp.MustCompile(`<(.|\n)*?>`)

// StripTags returns content with every "<...>" tag removed. Text outside of
// tags is left untouched; a "<" that is never followed by a ">" is not treated
// as a tag and stays in the result.
func StripTags(content string) string {
	return stripTagsPattern.ReplaceAllString(content, "")
}
|
|
|
|
func GetSource() []Source {
|
|
ti := time.Now()
|
|
compile := regexp.MustCompile(`(\d+)`)
|
|
nowDate := ti.Format("2006-01-02 15:04:05")
|
|
return []Source{
|
|
{
|
|
Name: "中央纪委国家监委网站",
|
|
SearchUrl: "https://www.ccdi.gov.cn/was5/web/search",
|
|
Method: "post",
|
|
ListQuery: ".center_box0 li",
|
|
QueryHandler: func(i int, selection *goquery.Selection, data *data.FetchData) {
|
|
data.Url, _ = selection.Find("a").First().Attr("href")
|
|
data.Title = selection.Find("b.title").First().Text()
|
|
data.Date = selection.Find("span.time").First().Text()
|
|
},
|
|
Type: "html",
|
|
KeywordField: "searchword",
|
|
ExternParam: map[string]string{
|
|
"channelid": "298814",
|
|
"orderby": "RELEVANCE",
|
|
},
|
|
HeaderFun: func(req *http.Request) {
|
|
req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
|
req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
|
|
req.Header.Add("Cache-Control", "max-age=0")
|
|
req.Header.Add("Connection", "keep-alive")
|
|
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
|
req.Header.Add("Origin", "https://www.ccdi.gov.cn")
|
|
req.Header.Add("Referer", "https://www.ccdi.gov.cn/")
|
|
req.Header.Add("Sec-Fetch-Dest", "document")
|
|
req.Header.Add("Sec-Fetch-Mode", "navigate")
|
|
req.Header.Add("Sec-Fetch-Site", "same-origin")
|
|
req.Header.Add("Sec-Fetch-User", "?1")
|
|
req.Header.Add("Upgrade-Insecure-Requests", "1")
|
|
req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
|
|
req.Header.Add("sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"")
|
|
req.Header.Add("sec-ch-ua-mobile", "?0")
|
|
req.Header.Add("sec-ch-ua-platform", "\"Windows\"")
|
|
},
|
|
},
|
|
{
|
|
Name: "百度新闻",
|
|
SearchUrl: "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&rsv_dl=ns_pc",
|
|
Method: "get",
|
|
ListQuery: "div[class=\"result-op c-container xpath-log new-pmd\"]",
|
|
QueryHandler: func(i int, selection *goquery.Selection, data *data.FetchData) {
|
|
data.Url, _ = selection.Attr("mu")
|
|
t := selection.Find(".news-title-font_1xS-F").First()
|
|
data.Title = t.Text()
|
|
data.Desc = selection.Find(".c-row .c-color-text").First().Text()
|
|
data.Source = selection.Find("span[aria-label*=\"新闻来源\"]").First().Text()
|
|
data.Date = selection.Find("span[class=\"c-color-gray2 c-font-normal c-gap-right-xsmall\"]").First().Text()
|
|
n := compile.FindAllStringSubmatch(data.Date, -1)
|
|
if nil != n {
|
|
nn, _ := strconv.Atoi(n[0][0])
|
|
if strings.Contains(data.Date, "小时") {
|
|
data.Date = ti.Add(-time.Duration(nn) * time.Hour).Format("2006-01-02 15:04")
|
|
}
|
|
if strings.Contains(data.Date, "分钟") {
|
|
data.Date = ti.Add(-time.Duration(nn) * time.Minute).Format("2006-01-02 15:04")
|
|
}
|
|
}
|
|
if strings.Contains(data.Date, "昨天") {
|
|
data.Date = ti.Add(-time.Duration(24) * time.Hour).Format("2006-01-02 15:04")
|
|
}
|
|
if strings.Contains(data.Date, "前天") {
|
|
data.Date = ti.Add(-time.Duration(48) * time.Hour).Format("2006-01-02 15:04")
|
|
}
|
|
},
|
|
Type: "html",
|
|
KeywordField: "word",
|
|
HeaderFun: func(req *http.Request) {
|
|
req.Header.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
|
req.Header.Add("accept-language", "zh-CN,zh;q=0.9")
|
|
req.Header.Add("cache-control", "no-cache")
|
|
req.Header.Add("connection", "keep-alive")
|
|
req.Header.Add("cookie", "BIDUPSID=844E3DCAA2EEBF5C872DC99B967B6B7B; PSTM=1655872163; BAIDUID=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; BD_UPN=123353; ORIGIN=2; ISSW=1; ISSW=1; BAIDUID_BFESS=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; ZFY=jWFAySgO:AoQfb6emY9vnmEdptVao:Anj0FFkp028wFws:C; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=42_0_2_2_3_0_1_0_2_0_0_0_18_0_51_0_1655888428_0_1655888377%7C3%230_0_1655888377%7C1; BAIDU_WISE_UID=wapp_1655902298617_702; ZD_ENTRY=google; channel=baidusearch; baikeVisitId=b3b23509-9330-4d33-82ae-b8eb37895917; BA_HECTOR=8k2g2g218ga40181ak1hbgg1n14; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDSVRTM=1011; H_PS_PSSID=36550_36459_36673_36455_36453_36692_36165_36695_36697_36569_36075_36467_36316_36651")
|
|
req.Header.Add("referer", "http://news.baidu.com/")
|
|
req.Header.Add("sec-fetch-dest", "document")
|
|
req.Header.Add("sec-fetch-mode", "navigate")
|
|
req.Header.Add("sec-fetch-site", "cross-site")
|
|
req.Header.Add("sec-fetch-user", "?1")
|
|
req.Header.Add("upgrade-insecure-requests", "1")
|
|
req.Header.Add("user-agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
|
|
req.Header.Add("#sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"")
|
|
req.Header.Add("sec-ch-ua-mobile", "?0")
|
|
req.Header.Add("sec-ch-ua-platform", "\"Windows\"")
|
|
req.Header.Add("postman-token", "81407fbc-2b96-54a7-0193-f640156714ab")
|
|
|
|
},
|
|
},
|
|
{
|
|
Name: "中国纪检监察报",
|
|
SearchUrl: "https://jjjcb.ccdi.gov.cn/reader/layout/getSearch.do?beginDocPubTime=&searchField=3&author=&mc=%E4%B8%AD%E5%9B%BD%E7%BA%AA%E6%A3%80%E7%9B%91%E5%AF%9F%E6%8A%A5&endDocPubTime=&pageNo=1&pageSize=10&flag=0&sort=0&asc=1",
|
|
Method: "get",
|
|
Type: "ajax",
|
|
ListQuery: "",
|
|
KeywordField: "keyword",
|
|
HeaderFun: func(req *http.Request) {
|
|
req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
|
req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
|
|
req.Header.Add("Cache-Control", "max-age=0")
|
|
req.Header.Add("Connection", "keep-alive")
|
|
req.Header.Add("Upgrade-Insecure-Requests", "1")
|
|
req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
|
|
},
|
|
AjaxDealFun: func(fetchData *[]data.FetchData, response *http.Response) {
|
|
bytes, err := ioutil.ReadAll(response.Body)
|
|
if err != nil {
|
|
log.Println(err)
|
|
return
|
|
}
|
|
var r CCDIPAGERResponse
|
|
err = json.Unmarshal(bytes, &r)
|
|
if err != nil {
|
|
log.Println(err)
|
|
return
|
|
}
|
|
for _, v := range r.Data.Info {
|
|
if "" == v.YT {
|
|
continue
|
|
}
|
|
desc := StripTags(v.IRCONTENT)
|
|
l := utf8.RuneCountInString(desc)
|
|
if l > 30 {
|
|
l = 30
|
|
}
|
|
desc = string([]rune(desc)[:30])
|
|
d := data.FetchData{
|
|
Url: fmt.Sprintf("https://jjjcb.ccdi.gov.cn/epaper/index.html?guid=%s", v.ZBGUID),
|
|
Title: v.DOCTITLE,
|
|
Desc: desc,
|
|
Date: v.DOCPUBTIME,
|
|
CreatedTime: nowDate,
|
|
Source: "中国纪检监察报",
|
|
}
|
|
*fetchData = append(*fetchData, d)
|
|
}
|
|
},
|
|
},
|
|
{
|
|
Name: "中新网搜索",
|
|
SearchUrl: "https://sou.chinanews.com.cn/search.do",
|
|
KeywordField: "q",
|
|
Method: "get",
|
|
Type: "html",
|
|
ListQuery: "#news_list table",
|
|
QueryHandler: func(i int, selection *goquery.Selection, fetchData *data.FetchData) {
|
|
t := selection.Find(".news_item a").First()
|
|
fetchData.Title = t.Text()
|
|
fetchData.Url, _ = t.Attr("href")
|
|
fetchData.Desc = selection.Find(".news_content").First().Text()
|
|
tt := selection.Find(".news_other").First().Text()
|
|
fet := strings.Split(tt, "html")
|
|
fetchData.Date = fet[len(fet)-1]
|
|
},
|
|
},
|
|
{
|
|
Name: "新浪新闻搜索",
|
|
SearchUrl: "https://search.sina.com.cn/?range=title&c=news&time=&ie=utf-8",
|
|
Method: "get",
|
|
Type: "html",
|
|
KeywordField: "q",
|
|
ListQuery: "#wrap .box-result",
|
|
Header: map[string]string{
|
|
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
|
|
"authority": "search.sina.com.cn",
|
|
"referer": "https://news.sina.com.cn/rol",
|
|
},
|
|
QueryHandler: func(i int, selection *goquery.Selection, fetchData *data.FetchData) {
|
|
t := selection.Find("h2>a").First()
|
|
fetchData.Title = t.Text()
|
|
fetchData.Url, _ = t.Attr("href")
|
|
fetchData.Desc = selection.Find(".r-info .content").Text()
|
|
s := selection.Find("h2 >.fgray_time").First().Text()
|
|
ll := strings.Fields(s)
|
|
if len(ll) > 2 {
|
|
fetchData.Date = ll[1] + " " + ll[2]
|
|
fetchData.Source = ll[0]
|
|
} else {
|
|
fetchData.Date = ll[1]
|
|
fetchData.Source = ll[0]
|
|
}
|
|
|
|
},
|
|
},
|
|
{
|
|
Name: "联合早报",
|
|
SearchUrl: "https://www.zaobao.com/search?pageNo=1&pageSize=10",
|
|
Method: "get",
|
|
Type: "ajax",
|
|
Header: map[string]string{
|
|
"authority": "www.zaobao.com",
|
|
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
|
|
},
|
|
KeywordField: "keywords",
|
|
AjaxDealFun: func(i *[]data.FetchData, response *http.Response) {
|
|
bytes, err := ioutil.ReadAll(response.Body)
|
|
if err != nil {
|
|
log.Println(err)
|
|
return
|
|
}
|
|
var r ZaoBaoResponse
|
|
err = json.Unmarshal(bytes, &r)
|
|
if err != nil {
|
|
log.Println(err)
|
|
return
|
|
}
|
|
for _, datum := range r.Result.Data {
|
|
t := time.UnixMilli(datum.PublicationDate)
|
|
v := data.FetchData{
|
|
Url: "https://www.zaobao.com" + datum.URL,
|
|
Title: datum.Title,
|
|
Desc: datum.ContentPreview,
|
|
Date: t.Format("2006-01-02 15:04:05"),
|
|
}
|
|
*i = append(*i, v)
|
|
}
|
|
},
|
|
},
|
|
{
|
|
Name: "新京报",
|
|
SearchUrl: "https://s.bjnews.com.cn/bjnews/getlist?from=bw&page=1&orderby=0",
|
|
KeywordField: "bwsk",
|
|
Type: "ajax",
|
|
Method: "get",
|
|
Header: map[string]string{
|
|
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
|
|
},
|
|
Target: reflect.TypeOf(BjNewsResponse{}),
|
|
AjaxSimpleDeal: func(rr interface{}, i *[]data.FetchData) {
|
|
r := rr.(BjNewsResponse)
|
|
for _, v := range r.Data.Data {
|
|
item := data.FetchData{
|
|
Url: v.Source.DetailURL.PcURL,
|
|
Title: v.Source.Title,
|
|
Desc: v.Highlight.Desc,
|
|
Date: v.Source.PublishTime,
|
|
}
|
|
*i = append(*i, item)
|
|
}
|
|
},
|
|
},
|
|
{
|
|
Name: "环球网",
|
|
SearchUrl: "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=site%3Ahuanqiu.com%20${keyword}&oq=site%3Ahuanqiu.com%20${keyword}&rsv_pq=fd7641bf0000c95a&rsv_t=9e2bJvQ2hjc8VfH%2F%2BWNQlLfJTQiVdsd2EOjNTtBJWMNJwqOgrJBau3sV408&rqlang=cn&rsv_enter=1&gpc=stf%3D1658739916%2C1658826316%7Cstftype%3D1&tfflag=1&si=huanqiu.com&ct=2097152&bs=site%3Ahuanqiu.com%20${keyword}",
|
|
Method: "get",
|
|
ListQuery: "div[class=\"result c-container xpath-log new-pmd\"]",
|
|
QueryHandler: func(i int, selection *goquery.Selection, data *data.FetchData) {
|
|
data.Url, _ = selection.Attr("mu")
|
|
t := selection.Find("h3[class='c-title t t tts-title'] a").First()
|
|
data.Title = t.Text()
|
|
data.Desc = selection.Find(".content-right_8Zs40").First().Text()
|
|
data.Source = selection.Find(".source_1Vdff .c-color-gray").First().Text()
|
|
data.Date = selection.Find("span[class=\"c-color-gray2\"]").First().Text()
|
|
n := compile.FindAllStringSubmatch(data.Date, -1)
|
|
if nil != n {
|
|
nn, _ := strconv.Atoi(n[0][0])
|
|
if strings.Contains(data.Date, "小时") {
|
|
data.Date = ti.Add(-time.Duration(nn) * time.Hour).Format("2006-01-02 15:04")
|
|
}
|
|
if strings.Contains(data.Date, "分钟") {
|
|
data.Date = ti.Add(-time.Duration(nn) * time.Minute).Format("2006-01-02 15:04")
|
|
}
|
|
if strings.Contains(data.Date, "天") {
|
|
data.Date = ti.Add(-time.Duration(nn) * time.Hour * 24).Format("2006-01-02 15:04")
|
|
}
|
|
}
|
|
if strings.Contains(data.Date, "昨天") {
|
|
data.Date = ti.Add(-time.Duration(24) * time.Hour).Format("2006-01-02 15:04")
|
|
}
|
|
if strings.Contains(data.Date, "前天") {
|
|
data.Date = ti.Add(-time.Duration(48) * time.Hour).Format("2006-01-02 15:04")
|
|
}
|
|
},
|
|
Type: "html",
|
|
HeaderFun: func(req *http.Request) {
|
|
req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
|
req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
|
|
req.Header.Add("Cache-Control", "max-age=0")
|
|
req.Header.Add("Connection", "keep-alive")
|
|
req.Header.Add("Cookie", "ORIGIN=2; ISSW=1; ISSW=1; BAIDUID=D027CF9B0DA3E84567371CDDB93354D9:FG=1; BIDUPSID=D027CF9B0DA3E84567371CDDB93354D9; PSTM=1658826597; delPer=0; BD_CK_SAM=1; PSINO=3; H_PS_PSSID=36826_36557_36624_36726_36413_36841_36949_36166_36919_36816_36569_36803_36742_26350_36930; kleck=f7cc9cf597ac3b3b07c0e39685f3866d; BD_UPN=123353; H_PS_645EC=7751lTb3sntYDTYnQ3dYOUsQm%2F33Fj1OieW5PEXQtyfdM%2BQr%2FIJXHy6B8Go; BA_HECTOR=8k8g04048505a0ag0l813l5e1hdvbr617; ZFY=GeArY5Cvc06Apm6W3TYqUXvnnz4AJG0RjaCGRoJjzTY:C")
|
|
req.Header.Add("Referer", "https://wappass.baidu.com/")
|
|
req.Header.Add("Sec-Fetch-Dest", "document")
|
|
req.Header.Add("Sec-Fetch-Mode", "navigate")
|
|
req.Header.Add("Sec-Fetch-Site", "same-site")
|
|
req.Header.Add("Sec-Fetch-User", "?1")
|
|
req.Header.Add("Upgrade-Insecure-Requests", "1")
|
|
req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
|
|
req.Header.Add("sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"")
|
|
req.Header.Add("sec-ch-ua-mobile", "?0")
|
|
req.Header.Add("sec-ch-ua-platform", "\"Linux\"")
|
|
},
|
|
},
|
|
{
|
|
Name: "凤凰新闻",
|
|
SearchUrl: "https://shankapi.ifeng.com/season/getSoFengData/all/${keyword}/1/getSoFengDataCallback?callback=getSoFengDataCallback2",
|
|
Type: "ajax",
|
|
Method: "get",
|
|
AjaxDealFun: func(i *[]data.FetchData, response *http.Response) {
|
|
bytes, err := ioutil.ReadAll(response.Body)
|
|
if err != nil {
|
|
log.Println(err)
|
|
return
|
|
}
|
|
r := string(bytes)
|
|
r = strings.Replace(r, "getSoFengDataCallback(", "", 1)
|
|
r = strings.TrimRight(r, ")")
|
|
var res IFENGResponse
|
|
err = json.Unmarshal([]byte(r), &res)
|
|
if err != nil {
|
|
log.Println(err)
|
|
return
|
|
}
|
|
for _, v := range res.Data.Items {
|
|
*i = append(*i, data.FetchData{
|
|
Url: "https:" + v.URL,
|
|
Title: StripTags(v.Title),
|
|
Source: v.Source,
|
|
})
|
|
}
|
|
},
|
|
},
|
|
{
|
|
Name: "人民网",
|
|
SearchUrl: "http://search.people.cn/search-platform/front/search",
|
|
Type: "ajax",
|
|
Method: "post",
|
|
IsJson: true,
|
|
KeywordField: "key",
|
|
Target: reflect.TypeOf(PeopleResponse{}),
|
|
ExternParam: map[string]string{
|
|
"page": "1",
|
|
"limit": "10",
|
|
"hasTitle": "true",
|
|
"hasContent": "true",
|
|
"isFuzzy": "false",
|
|
"type": "0",
|
|
"sortType": "0",
|
|
"startTime": "0",
|
|
"endTime": "0",
|
|
},
|
|
AjaxSimpleDeal: func(i interface{}, v *[]data.FetchData) {
|
|
r := i.(PeopleResponse)
|
|
for _, record := range r.Data.Records {
|
|
tt := time.UnixMilli(int64(record.InputTime))
|
|
*v = append(*v, data.FetchData{
|
|
Url: record.Url,
|
|
Title: StripTags(record.Title),
|
|
Desc: StripTags(record.Content),
|
|
Date: tt.Format("2006-01-02 15:04:05"),
|
|
Source: record.OriginName,
|
|
})
|
|
}
|
|
},
|
|
HeaderFun: func(req *http.Request) {
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Add("Cookie", "__jsluid_h=a1b7d0d8dad3604c9393bbcaf36ced1f; sso_c=0; sfr=1; __jsluid_h=fbf7d0abc29ec349c0c0c89c779c268c")
|
|
req.Header.Add("Origin", "http://search.people.cn")
|
|
req.Header.Add("Referer", "http://search.people.cn/s/?keyword=%E7%BA%AA%E6%A3%80&st=0")
|
|
req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
|
|
},
|
|
},
|
|
}
|
|
}
|