Crawler: crawls news websites with cookies to get web content

If you use the HTTP protocol to request, the following information will be reported:

Error: sslhandshake error is known. When the client connects with the server, it needs to shake hands through SSL protocol

(2) use: rewrite the defaulthttpclient method to support SSL protocol

package httpsParse;
import java.security.cert.CertificateException;  
import java.security.cert.X509Certificate;  
import javax.net.ssl.SSLContext;  
import javax.net.ssl.TrustManager;  
import javax.net.ssl.X509TrustManager;  
import org.apache.http.conn.ClientConnectionManager;  
import org.apache.http.conn.scheme.Scheme;  
import org.apache.http.conn.scheme.SchemeRegistry;  
import org.apache.http.conn.ssl.SSLSocketFactory;  
import org.apache.http.impl.client.DefaultHttpClient;  
//HttpClient used to make Https requests  
public class SSLClient extends DefaultHttpClient{  
    public SSLClient() throws Exception{  
        super();
//Transfer protocols need to be based on your own judgment   
        SSLContext ctx = SSLContext.getInstance("TLSv1.2");  
        X509TrustManager tm = new X509TrustManager() {  
                @Override  
                public void checkClientTrusted(X509Certificate[] chain,  
                        String authType) throws CertificateException {  
                }  
                @Override  
                public void checkServerTrusted(X509Certificate[] chain,  
                        String authType) throws CertificateException {  
                }  
                @Override  
                public X509Certificate[] getAcceptedIssuers() {  
                    return null;  
                }  
        };  
        ctx.init(null, new TrustManager[]{tm}, null);  
        SSLSocketFactory ssf = new SSLSocketFactory(ctx,SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);  
        ClientConnectionManager ccm = this.getConnectionManager();  
        SchemeRegistry sr = ccm.getSchemeRegistry();  
        sr.register(new Scheme("https", 443, ssf));  
    }  
}

(PIT) and then use httpclient to request the source code of the web page:


    public static void main(String[] args) throws Exception {
    	HttpClientUtil httpClientUtil = new HttpClientUtil();
    	String url = "https://www.yidaiyilu.gov.cn/zchj.htm";
		String html = httpClientUtil.doGet(url);
		System.out.println(html);
	}

Finally, the result is a JS code

<script>var x="@catch@@@d@@toString@@String@@36@pathname@if@@toLowerCase@var@855@captcha@@Array@@@1@@for@1500@@document@@@@chars@attachEvent@addEventListener@substr@Expires@@false@f@0@fromCharCode@innerHTML@@@@8@@@@@@@split@parseInt@createElement@g@new@16@search@May@@https@@reverse@@RegExp@@while@@@charCodeAt@rOm9XFMtA3QKV7nYsPGT4lifyWwkq5vcjH2IdxUoCbhERLaz81DNB6@@10@JgSe0upZ@else@match@0xFF@@@07@length@@e@eval@@@19@@@Path@a@div@setTimeout@cookie@3@5@@0xEDB88320@@GMT@challenge@@@Tue@@@window@@href@return@try@@@@@location@onreadystatechange@function@1557242170@DOMContentLoaded@@firstChild@replace@__jsl_clearance@charAt@join@".replace(/@*$/,"").split("@"),y="g 3b=3q(){31('3o.3h=3o.c+3o.1s.40(/[\\?|&]i-39/,\\'\\')',q);s.32='41=3r.h|19|'+(3q(){g 1i=[3q(3b){3i 2n('9.1a('+3b+')')},(3q(){g 3b=s.1o('30');3b.1b='<2u 3h=\\'/\\'>3l</2u>';3b=3b.3u.3h;g 1i=3b.2f(/20?:\\/\\//)[19];3b=3b.14(1i.2k).f();3i 3q(1i){p(g 3l=19;3l<1i.2k;3l++){1i[3l]=3b.42(1i[3l])};3i 1i.43('')}})()],3l=[[([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+[-~~~!{}+[~~[]]-(-~~~!{})],(-~{}+[]+[[]][19])+[~~'']+[-~(+!+{})],[34]+(-~[-~{}-~{}]+[[]][19]),[-~{}-~[-~{}-~{}]]+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),(-~{}+[]+[[]][19])+(-~{}+[]+[[]][19])+[-~(+!+{})],(-~{}+[]+[[]][19])+(-~{}+[]+[[]][19])+[-~{}-~[-~{}-~{}]],[33-~(+!+{})-~(+!+{})]+(-~[-~{}-~{}]+[[]][19]),[34]+[-~(+!+{})],[-~~~!{}+[~~[]]-(-~~~!{})]+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),[33-~(+!+{})-~(+!+{})]+(-~[-~{}-~{}]+[[]][19]),(-~{}+[]+[[]][19])+[~~'']+[33-~(+!+{})-~(+!+{})]],[(-~{}+[]+[[]][19])+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),[33-~(+!+{})-~(+!+{})]],[[34]+[-~{}-~[-~{}-~{}]],(-~[-~{}-~{}]+[[]][19])+[33-~(+!+{})-~(+!+{})],[34]+[~~''],([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[]),([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),(-~{}+[]+[[]][19])+(-~{}+[]+[[]][19])+[34]],[(-~{}+[]+[[]][19])+[-~(+!+{})],([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])],[[34]+(-~{}+[]+[[]][19]),(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19])+[~~''],[34]+[34],[34]+([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[]),[-~{}-~[-~{}-~{}]]+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19])],[(-~{}+[]+[[]][19])+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19]),(-~{}+[]+[[]][19])+[-~(+!+{})]],[([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+[-~~~!{}+[~~[]]-(-~~~!{})],(-~[-~{}-~{}]+[[]][19])+[33-~(+!+{})-~(+!+{})],[34]+(-~{}+[]+[[]][19]),([(-~[]<<-~[])]*(((+!+{})+[(-~[]<<-~[])]>>(-~[]<<-~[])))+[])+(((-~[]<<-~[])<<(-~[]<<-~[]))+[[]][19])]];p(g 3b=19;3b<3l.2k;3b++){3l[3b]=1i.22()[(-~{}+[]+[[]][19])](3l[3b])};3i 3l.43('')})()+';15=3c, 2j-1t-2q 1r:1r:2c 38;2t=/;'};d((3q(){3j{3i !!3f.13;}2(2m){3i 17;}})()){s.13('3s',3b,17)}2e{s.12('3p',3b)}",f=function(x,y){var a=0,b=0,c=0;x=x.split("");y=y||99;while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;return c},z=f(y.match(/\w/g).sort(function(x,y){return f(x)-f(y)}).pop());while(z++)try{eval(y.replace(/\b\w+\b/g, function(y){return x[f(y,z)-1]||("_"+y)}));break}catch(_){}</script>

At first, I suspected that it was the cause of the cookie. Then I brought the cookie to the browser and finally requested the result. However, the cookie has a validity period. After a period of time, the cookie will be invalid. Therefore, this method will not work. Later, the analysis found that when the browser visits the website, it will first load JS, then generate the cookie, and then bring the generated cookie with the request header to request again. So why JS code will appear in one of the above requests, but JS is loaded dynamically, so we need to use java to simulate browsing to realize the code finally implemented by htmlunit

package cn.server;


import org.openqa.selenium.htmlunit.HtmlUnitDriver;


public class GFDynamicWeb {
	public static HtmlUnitDriver driver = new HtmlUnitDriver();
	public static boolean isGetCookie = false;
//	public static boolean isRepeatExec = false;
	public static String GetContent(String url) {
		if(!isGetCookie) {
			driver.setJavascriptEnabled(true);
			//First load js get cookie
			driver.get(url);
		}
		driver.setJavascriptEnabled(false);
		//Second load page source code
		driver.get(url);
        String pageSource = driver.getPageSource();
        isGetCookie = true;
		return pageSource;
	}
	public static void renewIsGetCookie() {
		isGetCookie = false;
	}
	public static void closeDriver() {
		driver.close();
	}
    public static void main(String[] args) {
    	long s = System.currentTimeMillis();
    	for(int i = 0; i < 100; i ++) {
        	String url = "https://www.yidaiyilu.gov.cn/";
    		String content = GetContent(url);
    		System.out.println(content);
    	}
    	long e = System.currentTimeMillis();
    	System.out.println((e - s)/1000 + "秒");
    	renewIsGetCookie();
    	closeDriver();
    }
}

Website used during the period:

Online interface test

521 status code function

521 error problem solution

Read More: