package newsource import ( "encoding/json" "fmt" "github.com/PuerkitoBio/goquery" "github/fthvgb1/newsfetch/data" "io/ioutil" "log" "net/http" "reflect" "regexp" "strconv" "strings" "time" "unicode/utf8" ) type Source struct { Name string SearchUrl string Method string Type string KeywordField string ListQuery string QueryHandler func(i int, selection *goquery.Selection, fetchData *data.FetchData) AjaxHandler func(fetchData *data.FetchData) Header map[string]string HeaderFun func(r *http.Request) ExternParam map[string]string AjaxDealFun func(*[]data.FetchData, *http.Response) Target reflect.Type UnmarshalerFun func([]byte) interface{} AjaxSimpleDeal func(interface{}, *[]data.FetchData) IsJson bool } func StripTags(content string) string { re := regexp.MustCompile(`<(.|\n)*?>`) return re.ReplaceAllString(content, "") } func GetSource() []Source { ti := time.Now() compile := regexp.MustCompile(`(\d+)`) nowDate := ti.Format("2006-01-02 15:04:05") return []Source{ { Name: "中央纪委国家监委网站", SearchUrl: "https://www.ccdi.gov.cn/was5/web/search", Method: "post", ListQuery: ".center_box0 li", QueryHandler: func(i int, selection *goquery.Selection, data *data.FetchData) { data.Url, _ = selection.Find("a").First().Attr("href") data.Title = selection.Find("b.title").First().Text() data.Date = selection.Find("span.time").First().Text() }, Type: "html", KeywordField: "searchword", ExternParam: map[string]string{ "channelid": "298814", "orderby": "RELEVANCE", }, HeaderFun: func(req *http.Request) { req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9") req.Header.Add("Cache-Control", "max-age=0") req.Header.Add("Connection", "keep-alive") req.Header.Set("Content-Type", "application/x-www-form-urlencoded") req.Header.Add("Origin", "https://www.ccdi.gov.cn") req.Header.Add("Referer", "https://www.ccdi.gov.cn/") req.Header.Add("Sec-Fetch-Dest", "document") req.Header.Add("Sec-Fetch-Mode", "navigate") req.Header.Add("Sec-Fetch-Site", "same-origin") req.Header.Add("Sec-Fetch-User", "?1") req.Header.Add("Upgrade-Insecure-Requests", "1") req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36") req.Header.Add("sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"") req.Header.Add("sec-ch-ua-mobile", "?0") req.Header.Add("sec-ch-ua-platform", "\"Windows\"") }, }, { Name: "百度新闻", SearchUrl: "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&rsv_dl=ns_pc", Method: "get", ListQuery: "div[class=\"result-op c-container xpath-log new-pmd\"]", QueryHandler: func(i int, selection *goquery.Selection, data *data.FetchData) { data.Url, _ = selection.Attr("mu") t := selection.Find(".news-title-font_1xS-F").First() data.Title = t.Text() data.Desc = selection.Find(".c-row .c-color-text").First().Text() data.Source = selection.Find("span[aria-label*=\"新闻来源\"]").First().Text() data.Date = selection.Find("span[class=\"c-color-gray2 c-font-normal c-gap-right-xsmall\"]").First().Text() n := compile.FindAllStringSubmatch(data.Date, -1) if nil != n { nn, _ := strconv.Atoi(n[0][0]) if strings.Contains(data.Date, "小时") { data.Date = ti.Add(-time.Duration(nn) * time.Hour).Format("2006-01-02 15:04") } if strings.Contains(data.Date, "分钟") { data.Date = ti.Add(-time.Duration(nn) * time.Minute).Format("2006-01-02 15:04") } } if strings.Contains(data.Date, "昨天") { data.Date = ti.Add(-time.Duration(24) * time.Hour).Format("2006-01-02 15:04") } if strings.Contains(data.Date, "前天") { data.Date = ti.Add(-time.Duration(48) * time.Hour).Format("2006-01-02 15:04") } }, Type: "html", KeywordField: "word", HeaderFun: func(req *http.Request) { req.Header.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") req.Header.Add("accept-language", "zh-CN,zh;q=0.9") req.Header.Add("cache-control", "no-cache") req.Header.Add("connection", "keep-alive") req.Header.Add("cookie", "BIDUPSID=844E3DCAA2EEBF5C872DC99B967B6B7B; PSTM=1655872163; BAIDUID=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; BD_UPN=123353; ORIGIN=2; ISSW=1; ISSW=1; BAIDUID_BFESS=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; ZFY=jWFAySgO:AoQfb6emY9vnmEdptVao:Anj0FFkp028wFws:C; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=42_0_2_2_3_0_1_0_2_0_0_0_18_0_51_0_1655888428_0_1655888377%7C3%230_0_1655888377%7C1; BAIDU_WISE_UID=wapp_1655902298617_702; ZD_ENTRY=google; channel=baidusearch; baikeVisitId=b3b23509-9330-4d33-82ae-b8eb37895917; BA_HECTOR=8k2g2g218ga40181ak1hbgg1n14; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDSVRTM=1011; H_PS_PSSID=36550_36459_36673_36455_36453_36692_36165_36695_36697_36569_36075_36467_36316_36651") req.Header.Add("referer", "http://news.baidu.com/") req.Header.Add("sec-fetch-dest", "document") req.Header.Add("sec-fetch-mode", "navigate") req.Header.Add("sec-fetch-site", "cross-site") req.Header.Add("sec-fetch-user", "?1") req.Header.Add("upgrade-insecure-requests", "1") req.Header.Add("user-agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36") req.Header.Add("#sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"") req.Header.Add("sec-ch-ua-mobile", "?0") req.Header.Add("sec-ch-ua-platform", "\"Windows\"") req.Header.Add("postman-token", "81407fbc-2b96-54a7-0193-f640156714ab") }, }, { Name: "中国纪检监察报", SearchUrl: "https://jjjcb.ccdi.gov.cn/reader/layout/getSearch.do?beginDocPubTime=&searchField=3&author=&mc=%E4%B8%AD%E5%9B%BD%E7%BA%AA%E6%A3%80%E7%9B%91%E5%AF%9F%E6%8A%A5&endDocPubTime=&pageNo=1&pageSize=10&flag=0&sort=0&asc=1", Method: "get", Type: "ajax", ListQuery: "", KeywordField: "keyword", HeaderFun: func(req *http.Request) { req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9") req.Header.Add("Cache-Control", "max-age=0") req.Header.Add("Connection", "keep-alive") req.Header.Add("Upgrade-Insecure-Requests", "1") req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36") }, AjaxDealFun: func(fetchData *[]data.FetchData, response *http.Response) { bytes, err := ioutil.ReadAll(response.Body) if err != nil { log.Println(err) return } var r CCDIPAGERResponse err = json.Unmarshal(bytes, &r) if err != nil { log.Println(err) return } for _, v := range r.Data.Info { if "" == v.YT { continue } desc := StripTags(v.IRCONTENT) l := utf8.RuneCountInString(desc) if l > 30 { l = 30 } desc = string([]rune(desc)[:30]) d := data.FetchData{ Url: fmt.Sprintf("https://jjjcb.ccdi.gov.cn/epaper/index.html?guid=%s", v.ZBGUID), Title: v.DOCTITLE, Desc: desc, Date: v.DOCPUBTIME, CreatedTime: nowDate, Source: "中国纪检监察报", } *fetchData = append(*fetchData, d) } }, }, { Name: "中新网搜索", SearchUrl: "https://sou.chinanews.com.cn/search.do", KeywordField: "q", Method: "get", Type: "html", ListQuery: "#news_list table", QueryHandler: func(i int, selection *goquery.Selection, fetchData *data.FetchData) { t := selection.Find(".news_item a").First() fetchData.Title = t.Text() fetchData.Url, _ = t.Attr("href") fetchData.Desc = selection.Find(".news_content").First().Text() tt := selection.Find(".news_other").First().Text() fet := strings.Split(tt, "html") fetchData.Date = fet[len(fet)-1] }, }, { Name: "新浪新闻搜索", SearchUrl: "https://search.sina.com.cn/?range=title&c=news&time=&ie=utf-8", Method: "get", Type: "html", KeywordField: "q", ListQuery: "#wrap .box-result", Header: map[string]string{ "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36", "authority": "search.sina.com.cn", "referer": "https://news.sina.com.cn/rol", }, QueryHandler: func(i int, selection *goquery.Selection, fetchData *data.FetchData) { t := selection.Find("h2>a").First() fetchData.Title = t.Text() fetchData.Url, _ = t.Attr("href") fetchData.Desc = selection.Find(".r-info .content").Text() s := selection.Find("h2 >.fgray_time").First().Text() ll := strings.Fields(s) if len(ll) > 2 { fetchData.Date = ll[1] + " " + ll[2] fetchData.Source = ll[0] } else { fetchData.Date = ll[1] fetchData.Source = ll[0] } }, }, { Name: "联合早报", SearchUrl: "https://www.zaobao.com/search?pageNo=1&pageSize=10", Method: "get", Type: "ajax", Header: map[string]string{ "authority": "www.zaobao.com", "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36", }, KeywordField: "keywords", AjaxDealFun: func(i *[]data.FetchData, response *http.Response) { bytes, err := ioutil.ReadAll(response.Body) if err != nil { log.Println(err) return } var r ZaoBaoResponse err = json.Unmarshal(bytes, &r) if err != nil { log.Println(err) return } for _, datum := range r.Result.Data { t := time.UnixMilli(datum.PublicationDate) v := data.FetchData{ Url: "https://www.zaobao.com" + datum.URL, Title: datum.Title, Desc: datum.ContentPreview, Date: t.Format("2006-01-02 15:04:05"), } *i = append(*i, v) } }, }, { Name: "新京报", SearchUrl: "https://s.bjnews.com.cn/bjnews/getlist?from=bw&page=1&orderby=0", KeywordField: "bwsk", Type: "ajax", Method: "get", Header: map[string]string{ "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36", }, Target: reflect.TypeOf(BjNewsResponse{}), AjaxSimpleDeal: func(rr interface{}, i *[]data.FetchData) { r := rr.(BjNewsResponse) for _, v := range r.Data.Data { item := data.FetchData{ Url: v.Source.DetailURL.PcURL, Title: v.Source.Title, Desc: v.Highlight.Desc, Date: v.Source.PublishTime, } *i = append(*i, item) } }, }, { Name: "环球网", SearchUrl: "https://www.baidu.com/s?wd=site:huanqiu.com%20", Method: "get", ListQuery: "div[class=\"result c-container xpath-log new-pmd\"]", QueryHandler: func(i int, selection *goquery.Selection, data *data.FetchData) { data.Url, _ = selection.Attr("mu") t := selection.Find("h3[class='c-title t t tts-title'] a").First() data.Title = t.Text() data.Desc = selection.Find(".content-right_8Zs40").First().Text() data.Source = selection.Find(".source_1Vdff .c-color-gray").First().Text() data.Date = selection.Find("span[class=\"c-color-gray2\"]").First().Text() n := compile.FindAllStringSubmatch(data.Date, -1) if nil != n { nn, _ := strconv.Atoi(n[0][0]) if strings.Contains(data.Date, "小时") { data.Date = ti.Add(-time.Duration(nn) * time.Hour).Format("2006-01-02 15:04") } if strings.Contains(data.Date, "分钟") { data.Date = ti.Add(-time.Duration(nn) * time.Minute).Format("2006-01-02 15:04") } if strings.Contains(data.Date, "天") { data.Date = ti.Add(-time.Duration(nn) * time.Hour * 24).Format("2006-01-02 15:04") } } if strings.Contains(data.Date, "昨天") { data.Date = ti.Add(-time.Duration(24) * time.Hour).Format("2006-01-02 15:04") } if strings.Contains(data.Date, "前天") { data.Date = ti.Add(-time.Duration(48) * time.Hour).Format("2006-01-02 15:04") } }, Type: "html", HeaderFun: func(req *http.Request) { req.Header.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") req.Header.Add("accept-language", "zh-CN,zh;q=0.9") req.Header.Add("cache-control", "no-cache") req.Header.Add("connection", "keep-alive") req.Header.Add("cookie", "BIDUPSID=844E3DCAA2EEBF5C872DC99B967B6B7B; PSTM=1655872163; BAIDUID=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; BD_UPN=123353; ORIGIN=2; ISSW=1; ISSW=1; BAIDUID_BFESS=844E3DCAA2EEBF5CB3E1D79750162204:FG=1; ZFY=jWFAySgO:AoQfb6emY9vnmEdptVao:Anj0FFkp028wFws:C; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=42_0_2_2_3_0_1_0_2_0_0_0_18_0_51_0_1655888428_0_1655888377%7C3%230_0_1655888377%7C1; BAIDU_WISE_UID=wapp_1655902298617_702; ZD_ENTRY=google; channel=baidusearch; baikeVisitId=b3b23509-9330-4d33-82ae-b8eb37895917; BA_HECTOR=8k2g2g218ga40181ak1hbgg1n14; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDSVRTM=1011; H_PS_PSSID=36550_36459_36673_36455_36453_36692_36165_36695_36697_36569_36075_36467_36316_36651") req.Header.Add("referer", "http://news.baidu.com/") req.Header.Add("sec-fetch-dest", "document") req.Header.Add("sec-fetch-mode", "navigate") req.Header.Add("sec-fetch-site", "cross-site") req.Header.Add("sec-fetch-user", "?1") req.Header.Add("upgrade-insecure-requests", "1") req.Header.Add("user-agent", "Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36") req.Header.Add("#sec-ch-ua", "\".Not/A)Brand\";v=\"99\", \"Google Chrome\";v=\"103\", \"Chromium\";v=\"103\"") req.Header.Add("sec-ch-ua-mobile", "?0") req.Header.Add("sec-ch-ua-platform", "\"Windows\"") req.Header.Add("postman-token", "81407fbc-2b96-54a7-0193-f640156714ab") }, }, { Name: "凤凰新闻", SearchUrl: "https://shankapi.ifeng.com/season/getSoFengData/all/${keyword}/1/getSoFengDataCallback?callback=getSoFengDataCallback2", Type: "ajax", Method: "get", AjaxDealFun: func(i *[]data.FetchData, response *http.Response) { bytes, err := ioutil.ReadAll(response.Body) if err != nil { log.Println(err) return } r := string(bytes) r = strings.Replace(r, "getSoFengDataCallback(", "", 1) r = strings.TrimRight(r, ")") var res IFENGResponse err = json.Unmarshal([]byte(r), &res) if err != nil { log.Println(err) return } for _, v := range res.Data.Items { *i = append(*i, data.FetchData{ Url: "https:" + v.URL, Title: StripTags(v.Title), Source: v.Source, }) } }, }, { Name: "人民网", SearchUrl: "http://search.people.cn/search-platform/front/search", Type: "ajax", Method: "post", IsJson: true, KeywordField: "key", Target: reflect.TypeOf(PeopleResponse{}), ExternParam: map[string]string{ "page": "1", "limit": "10", "hasTitle": "true", "hasContent": "true", "isFuzzy": "false", "type": "0", "sortType": "0", "startTime": "0", "endTime": "0", }, AjaxSimpleDeal: func(i interface{}, v *[]data.FetchData) { r := i.(PeopleResponse) for _, record := range r.Data.Records { tt := time.UnixMilli(int64(record.InputTime)) *v = append(*v, data.FetchData{ Url: record.Url, Title: StripTags(record.Title), Desc: StripTags(record.Content), Date: tt.Format("2006-01-02 15:04:05"), Source: record.OriginName, }) } }, HeaderFun: func(req *http.Request) { req.Header.Set("Content-Type", "application/json") req.Header.Add("Cookie", "__jsluid_h=a1b7d0d8dad3604c9393bbcaf36ced1f; sso_c=0; sfr=1; __jsluid_h=fbf7d0abc29ec349c0c0c89c779c268c") req.Header.Add("Origin", "http://search.people.cn") req.Header.Add("Referer", "http://search.people.cn/s/?keyword=%E7%BA%AA%E6%A3%80&st=0") req.Header.Add("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36") }, }, } }