基于httpclient和jsoup实现的爬虫
【摘要】 本文介绍如何基于 HttpClient 和 jsoup 实现一个简单的爬虫,包括四个步骤:创建 DefaultHttpClient 并设置代理(从配置中读取 proxy.host、proxy.port、proxy.protocol、proxy.user、proxy.pwd);模拟表单登录并获取 Cookie;关闭连接;携带 Cookie 分页爬取数据。
创建并设置代理
// Create the HTTP client and route its requests through the configured proxy.
// proxyHost, proxyPort, proxyProtocol, proxyUser, proxyPwd, proxy and prop are
// declared outside this snippet.
DefaultHttpClient httpclient = new DefaultHttpClient();

// NOTE(review): the default values for proxy.host and proxy.port are placeholders.
// If "proxy.port" is missing from the configuration, Integer.parseInt is handed the
// non-numeric placeholder and throws NumberFormatException - replace both defaults
// with real values before use.
proxyHost = prop.getProperty("proxy.host", "默认代理");
proxyPort = Integer.parseInt(prop.getProperty("proxy.port", "默认代理端口"));
proxyProtocol = prop.getProperty("proxy.protocol", "http");
proxyUser = prop.getProperty("proxy.user");
proxyPwd = prop.getProperty("proxy.pwd");

proxy = new HttpHost(proxyHost, proxyPort, proxyProtocol);
httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);

// Register proxy credentials only when a user name is configured:
// UsernamePasswordCredentials rejects a null user name.
if (proxyUser != null) {
    httpclient.getCredentialsProvider().setCredentials(
            new AuthScope(proxyHost, proxyPort),
            new UsernamePasswordCredentials(proxyUser, proxyPwd));
}
登录
@SuppressWarnings("rawtypes") private List setFormData(String user,String pwd) { Properties prop = ConfigFactory.getInstance().getProp(this.getClass().getResource("/").getPath()+CONFIG); List<NameValuePair> nvps = new ArrayList<NameValuePair>(8); Iterator iter = prop.entrySet().iterator(); while (iter.hasNext()) { @SuppressWarnings("unchecked") Map.Entry<String, String> entry = (Map.Entry<String, String>) iter .next(); if (isFormField(entry.getKey())) { if(entry.getKey().equalsIgnoreCase("uid") && user!=null) { nvps.add(new BasicNameValuePair(entry.getKey(), user)); }else if(entry.getKey().equalsIgnoreCase("password") && pwd!=null){ nvps.add(new BasicNameValuePair(entry.getKey(), pwd)); }else{ nvps.add(new BasicNameValuePair(entry.getKey(), entry .getValue())); } } } return nvps; } private static X509TrustManager trustManager = new X509TrustManager() { public X509Certificate[] getAcceptedIssuers() { // TODO Auto-generated // method stub return null; } public void checkServerTrusted( X509Certificate[] arg0, String arg1) throws CertificateException { // TODO Auto-generated // method stub } public void checkClientTrusted( X509Certificate[] arg0, String arg1) throws CertificateException { // TODO Auto-generated // method stub } }; DefaultHttpClient httpclient = new DefaultHttpClient(); // for SSL // SSLContext sslCtx = SSLContext.getInstance("TLS"); // sslCtx.init(null, new TrustManager[] // { trustManager }, null); // // SSLSocketFactory ssf = new SSLSocketFactory(sslCtx); // ssf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER); // ClientConnectionManager ccm = httpclient.getConnectionManager(); // SchemeRegistry sr = ccm.getSchemeRegistry(); // sr.register(new Scheme("https", ssf, 443)); // httpclient = new DefaultHttpClient(ccm, httpclient.getParams()); // for SSL end String postUrl = loginProp.getProperty("login.postUrl"); String encoding = loginProp.getProperty("encoding", "UTF-8"); HttpPost httpost = new HttpPost(postUrl); 
@SuppressWarnings("unchecked") List<NameValuePair> nvps = setFormData(user,pwd); httpost.setEntity(new UrlEncodedFormEntity(nvps, encoding)); HttpResponse response = httpclient.execute(httpost); HttpEntity entity = response.getEntity(); EntityUtils.consume(entity); logger.info("Post logon cookies:"); List<Cookie> cookies = httpclient.getCookieStore().getCookies(); StringBuffer cookieConcat = new StringBuffer(1024); if (cookies.isEmpty()) { logger.info("None"); } else { for (int i = 0; i < cookies.size(); i++) { cookieConcat.append(cookies.get(i).getName() + "=" + cookies.get(i).getValue() + ";"); } } cookieStr = cookieConcat.toString();
关闭连接
// Close the connection: shut down the client's connection manager once the
// client is no longer needed.
httpclient.getConnectionManager().shutdown();
数据爬取
String cookie = Login.getInstance().getCookies(userName,userPwd);//cookieStr String pattern = ""; if(cookie==null) { log4j.info("爬虫账号错误"); return; } List<PostRecord> list = new ArrayList<PostRecord>(); String crawl1 = url; pattern = "..."; list.addAll(crawl(crawl1, pattern, cookie,"...",0)); private List<PostRecord> crawl(String crawlHtml, String pattern, String cookie,String pcount,int offset){ String html = ""; List<PostRecord> postList = new ArrayList<PostRecord>(); URLFetcher urlFetcher = new URLFetcher(); while (true) { html = urlFetcher.fetch(crawlHtml, encoding, cookie); if (html == null) { for (int i = 0; i < TRY_TIMES; i++) { log4j.warn("try to fetch again: " + crawlHtml + " "); html = urlFetcher.fetch(crawlHtml, encoding, cookie); } } Document doc = Jsoup.parse(html); Elements elements = doc.select(pattern).select("..."); Elements elementCounts = doc.select(pcount); String text = ""; String href = ""; int c = 0,len = elementCounts.size(),count=0; for (Element elem : elements) { text = elem.text(); href = elem.attr("href"); count = 0; if(c+offset<len) { Element celem = elementCounts.get(c+offset); int idex = celem.text().indexOf(" "); count = Integer.parseInt(idex!=-1?celem.text().substring(0, idex):celem.text()); } PostRecord postRec = new PostRecord(text, href,count); postList.add(postRec); c++; } Elements element = doc.select(nextPageSelector); if(!element.isEmpty() ){ crawlHtml = "..." + element.attr("href"); continue; } break; } return postList; }
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)