- 微信
- 微博
  
  分享文章到微博
- 复制链接
  
  复制链接到剪贴板

java爬虫下载付费html网页模板

bigsai 发表于 2021/02/03 02:14:44 2021/02/03

【摘要】前言前一段时间我们有一个网页的project小项目，要求学习bootstrap。然而自己写的模板和别人写好的东西，无论从美观和手机运行的兼容性上差距都很巨大。中途我们放弃自己写的东西，开始偷别人的模板。有些不会偷的同学甚至还付费下载，都什么年代了，程序员还要花钱买模板。那次结束后，突发奇想能不能写个程序，让它自动下载模板。经过不断努力和解决bug，最终取得了成...

前言

前一段时间我们有一个网页的project小项目，要求学习bootstrap。然而自己写的模板和别人写好的东西，无论从美观和手机运行的兼容性上差距都很巨大。中途我们放弃自己写的东西，开始偷别人的模板。有些不会偷的同学甚至还付费下载，都什么年代了，程序员还要花钱买模板。那次结束后，突发奇想能不能写个程序，让它自动下载模板。经过不断努力和解决bug，最终取得了成功。

思路

大致思路为：输入模板的一个页面为url，通过这个链接遍历所有与之有关的链接放到hashset中（采用队列的宽度优先遍历bfs）。这个相关用字符判断链接前面的主要域名地址。（链出去的链接不处理，防止无限扩大）。同时，还要将各种url分类放到不同的set中。

html页面分析：抓取html链接。还要按行读取html文本分析其中可能隐藏的css文件（可能有背景图片）。获取js链接，获取image地址，css地址，（注意一定要储存绝对地址而不是相对地址）。还有的涉及到上层目录。需要处理。

css页面：按行分析。因为css中可能储存背景图片以及其他logo。
js：直接下载保存。
html：下载保存
image：下载保存

注意点：

所有下载链接或者其他活动都要在try catch进行，在catch中跳过这个步骤，执行相应步骤。
下载目录在download中自行更改（默认 F:\download）
添加jsoup的jar包
有些图片藏在js文件和css文件中，所以需要去分析js文件和css文件，我这个只分析了css，没分析js。
由于精力和时间问题，项目并没有完善；由于笔者此时正则能力不足，大部分采用字符串分割查找或者contains查找，难免有疏漏。
目前代码测试只针对17素材之家部分模板测试有效。其他站点未进行测试
只是小白，代码冗长低水平，大佬勿喷。
附上代码如下：

代码

启动主类getmoban

import java.io.IOException;
import java.util.Iterator;
import java.util.Scanner;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;


/**
 * Command-line entry point: reads a start URL, lets {@code geturl} crawl the
 * pages reachable from it, then hands every collected HTML / JS / CSS / image
 * URL to a fixed pool of download workers.
 */
public class getmoban {

	/**
	 * Runs the crawl and schedules all downloads.
	 *
	 * @param args unused
	 * @throws IOException if the crawl itself fails
	 */
	public static void main(String[] args) throws IOException
	{
		ExecutorService ex = Executors.newFixedThreadPool(6);
		try {
			Scanner sc = new Scanner(System.in);
			System.out.println("请输入网址（别太大否则下载不完）");
			String url = sc.nextLine().trim();
			if (url.isEmpty()) {
				// Nothing to crawl; avoid starting a crawl on an empty address.
				System.err.println("未输入网址");
				return;
			}

			// The constructor stores the start URL in geturl's static state.
			new geturl(url);
			System.out.println(geturl.file);
			geturl.judel();

			submitAll(ex, geturl.htmlurlset);
			submitAll(ex, geturl.jsset);

			// CSS files are downloaded and additionally scanned for url(...) references
			// (background images and similar).
			for (Object entry : geturl.cssset) {
				String name = String.valueOf(entry);
				try {
					ex.execute(new download(name));
					csssearch.searchimage(name);
				} catch (Exception e) {
					// Best effort: one broken stylesheet must not stop the remaining ones.
					System.err.println("css处理失败：" + name + " (" + e + ")");
				}
			}

			submitAll(ex, geturl.imgset);
		} finally {
			// Always release the worker threads, even when the crawl throws;
			// already-queued downloads still run to completion.
			ex.shutdown();
		}
	}

	/**
	 * Queues one download task per URL in {@code urls}.
	 * Accepts {@code Iterable<?>} so it works whether the sets are declared raw or generic.
	 */
	private static void submitAll(ExecutorService pool, Iterable<?> urls)
	{
		for (Object entry : urls) {
			String name = String.valueOf(entry);
			try {
				pool.execute(new download(name));
			} catch (RuntimeException e) {
				// Report and continue with the next URL instead of silently dropping it.
				System.err.println("无法提交下载：" + name + " (" + e + ")");
			}
		}
	}
}

  
 
  1
  2
  3
  4
  5
  6
  7
  8
  9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69

分析链接geturl

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Queue;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Breadth-first crawler for a single site.
 *
 * <p>Starting from {@link #url} it visits every same-host page reachable through
 * {@code <a href>} links and sorts the absolute URLs it finds into {@link #htmlurlset},
 * {@link #jsset}, {@link #cssset} and {@link #imgset}. All state is static, so only one
 * crawl can be active per JVM.
 */
public class geturl {
	/** Start page of the crawl; replaced by the constructor argument. */
	public static String url="http://www.17sucai.com/preview/1/2014-11-28/jQuery用户注册表单验证代码/index.html";
	/** Scheme of the start URL including the trailing colon, e.g. {@code "http:"}. */
	static String head="http:";
	/** Start URL without its scheme and without its last path segment (host plus directory). */
	static String file=url;
	static Set<String> htmlurlset=new HashSet<String>();// pages
	static Set<String> jsset=new HashSet<String>();// scripts
	static Set<String> imgset=new HashSet<String>();// images
	static Set<String> cssset=new HashSet<String>();// stylesheets
	/** Pages discovered but not yet analysed. */
	static Queue<String> queue=new ArrayDeque<String>();

	static {
		rebase(url);
	}

	/**
	 * Sets the start page and derives {@link #head} and {@link #file} from it.
	 *
	 * @param url absolute address of the page the crawl starts from
	 */
	public geturl(String url)
	{
		geturl.url=url;
		// Must happen after the assignment above; deriving these in an instance
		// initializer would still see the previous (default) URL.
		rebase(url);
	}

	/** Recomputes {@link #head} and {@link #file} from the given start URL. */
	private static void rebase(String start)
	{
		String rest=start;
		int sep=start.indexOf("//");
		if(start.startsWith("http")&&sep>=0)
		{
			head=start.substring(0,sep);
			rest=start.substring(sep+2);
		}
		int last=rest.lastIndexOf("/");
		file=last>=0?rest.substring(0,last):rest;
	}

	/** Returns the host part of {@link #file}, i.e. everything before its first slash. */
	private static String siteHost()
	{
		int slash=file.indexOf("/");
		return slash>0?file.substring(0,slash):file;
	}

	/**
	 * Crawls breadth-first from {@link #url} until no unvisited same-host page is left.
	 *
	 * @throws IOException declared for callers; page-level failures are handled in {@link #analyze}
	 */
	public static void judel() throws IOException
	{
		queue.add(url);
		htmlurlset.add(url);
		String host=siteHost();
		while(!queue.isEmpty())
		{
			String teamurl=queue.poll();
			System.out.println(teamurl);
			if(teamurl.endsWith(".com"))
			{
				// A bare site root is skipped so the crawl does not spread over the whole site.
				continue;
			}
			if(teamurl.contains(host))
			{
				analyze(teamurl);
			}
		}
	}

	/**
	 * Fetches one page and records the resources it references.
	 * Failures are reported and the page is skipped; the caller's loop continues with the next one.
	 *
	 * @param URL absolute address of the page to analyse
	 */
	public static void analyze(String URL)
	{
		try {
			Document doc=Jsoup.connect(URL).timeout(20000)
					.header("user-agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
					.ignoreContentType(true).get();
			String host=siteHost();

			// Background images hidden in inline style="...url(...)" attributes.
			for(Element styled:doc.select("[style]"))
			{
				String ref=firstCssUrl(styled.attr("style"));
				if(ref==null)
				{
					continue;
				}
				try {
					// Resolving against the current page handles "../", root-relative and absolute references.
					imgset.add(new java.net.URL(new java.net.URL(URL),ref).toString());
				} catch(java.net.MalformedURLException ignored) {
					// Unresolvable reference: nothing to download for it.
				}
			}

			for(Element htmlelement:doc.select("a[href]"))
			{
				String a=htmlelement.absUrl("href");
				int hash=a.indexOf('#');
				if(hash>=0)
				{
					a=a.substring(0,hash);
				}
				// add() returns false for pages already seen, so each page is queued once.
				if(!a.isEmpty()&&a.contains(host)&&htmlurlset.add(a))
				{
					queue.add(a);
				}
			}
			for(Element jselement:doc.getElementsByTag("script"))
			{
				addIfPresent(jsset,jselement.absUrl("src"));
			}
			for(Element csselement:doc.select("link[href]"))
			{
				addIfPresent(cssset,csselement.absUrl("href"));
			}
			for(Element imageelement:doc.select("img"))
			{
				addIfPresent(imgset,imageelement.absUrl("src"));
			}
		}
		catch(Exception e)
		{
			// Best effort: report and skip this page. The loop in judel() keeps draining the queue.
			System.err.println("页面分析失败：" + URL + " (" + e + ")");
		}
	}

	/** Adds {@code value} to {@code target} unless it is empty (absUrl yields "" when it cannot resolve). */
	private static void addIfPresent(Set<String> target,String value)
	{
		if(!value.isEmpty())
		{
			target.add(value);
		}
	}

	/**
	 * Extracts the first {@code url(...)} reference from a CSS fragment.
	 *
	 * @return the reference without surrounding quotes, or {@code null} when there is none
	 *         or it is an embedded {@code data:} URI
	 */
	private static String firstCssUrl(String css)
	{
		int start=css.indexOf("url(");
		if(start<0)
		{
			return null;
		}
		int end=css.indexOf(')',start);
		if(end<0)
		{
			return null;
		}
		String ref=css.substring(start+4,end).trim();
		if(ref.length()>=2&&(ref.startsWith("'")||ref.startsWith("\"")))
		{
			ref=ref.substring(1,ref.length()-1).trim();
		}
		if(ref.isEmpty()||ref.startsWith("data:"))
		{
			return null;
		}
		return ref;
	}
}

  
 
  1
  2
  3
  4
  5
  6
  7
  8
  9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135

分析css（css可能隐藏图片）csssearch

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

public class csssearch {

	public static void searchimage(String ur) throws IOException {
		if(ur.toLowerCase().contains("bootstarp")) {return;}//bootstarp.css过滤掉，肯定没图片
		Set imgset=new HashSet();
		//String ur="http://demo.cssmoban.com/cssthemes5/cpts_1019_bpi/css/style.css";
		String http="http";
		String fileurl=ur;
		if(fileurl.startsWith("http"))
		{ http=fileurl.split("//")[0];//防止https协议 fileurl=fileurl.split("//")[1];
		}
		fileurl=fileurl.substring(0,fileurl.lastIndexOf("/"));
		//System.out.println(fileurl);//测试
		URL url=new URL(ur); URLConnection conn = url.openConnection();
	 conn.setConnectTimeout(1000);
	 conn.setReadTimeout(5000);
	 conn.connect();
	 InputStream in= conn.getInputStream();
	 InputStreamReader inp=new InputStreamReader(in);
	 BufferedReader buf=new BufferedReader(inp);
	 File file=new File("F:\\download\\" ur.split("//")[1]); if(!file.exists()) { file.getParentFile().mkdirs(); file.createNewFile(); }
		// BufferedOutputStream bufout=new BufferedOutputStream(new FileOutputStream(file)); String tex=""; while((tex=buf.readLine())!=null) {
// System.out.println(tex); if(tex.contains("url")) { String urladress=fileurl; String imgurl=tex.split("url")[1]; imgurl=imgurl.split("\\(")[1].split("\\)")[0];//转义字符串 if(imgurl.startsWith("'")||imgurl.startsWith("\""))//注意转义字符串 { imgurl=imgurl.substring(1,imgurl.length()-1); } //System.out.println(imgurl);//测试 while(imgurl.startsWith("..")) { imgurl=imgurl.substring(imgurl.indexOf("/") 1); urladress=urladress.substring(0,urladress.lastIndexOf("/")); } urladress=http "//" urladress "/" imgurl; //System.out.println(urladress); //down.download(urladress); imgset.add(urladress); } }
	//	 bufout.close(); buf.close(); inp.close(); in.close(); Iterator it=imgset.iterator(); while(it.hasNext()) { String team=it.next(); try { download down=new download(team); Thread t1=new Thread(down); t1.start();System.out.println(team "下载成功");} catch(Exception e) {System.out.println("下载失败：" team);} } }
}


  
 
  1
  2
  3
  4
  5
  6
  7
  8
  9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85

download(线程池下载)

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads one URL into a local directory tree that mirrors the URL's host and path.
 * Instances are {@link Runnable} so they can be handed to a thread or an executor.
 */
public class download implements Runnable{
	/** Root directory that receives all downloads; change it to store files elsewhere. */
	public static String baseDir="F:\\download\\";

	/** Address this task downloads when run. */
	public String ur;

	public download() {}

	/**
	 * @param ur absolute URL to download when the task runs
	 */
	public download(String ur)
	{
		this.ur=ur;
	}

	/**
	 * Fetches {@code ur} and writes it below {@link #baseDir}, creating parent
	 * directories as needed and overwriting an existing file.
	 *
	 * @param ur absolute URL to download
	 * @throws IOException if the URL is malformed, the target would lie outside
	 *         {@link #baseDir}, or reading or writing fails
	 */
	public static void download(String ur) throws IOException
	{
		File root=new File(baseDir).getCanonicalFile();
		File file=new File(root, localPath(ur)).getCanonicalFile();
		// URLs come from scraped pages; refuse paths whose ".." segments leave the download root.
		if(!file.toPath().startsWith(root.toPath()))
		{
			throw new IOException("target outside download directory: " + ur);
		}

		URL url = new URL(ur);
		URLConnection conn = url.openConnection();
		conn.setConnectTimeout(4000);
		conn.setReadTimeout(5000);
		conn.connect();

		File parent=file.getParentFile();
		// Another worker may create the directory concurrently, so re-check after mkdirs().
		if(parent!=null&&!parent.isDirectory()&&!parent.mkdirs()&&!parent.isDirectory())
		{
			throw new IOException("cannot create directory " + parent);
		}

		// try-with-resources closes both streams even when the transfer fails midway.
		try (InputStream buf=new BufferedInputStream(conn.getInputStream());
				BufferedOutputStream bufout=new BufferedOutputStream(new FileOutputStream(file))) {
			byte b[]=new byte[8192];
			int n;
			while((n=buf.read(b))!=-1)
			{
				bufout.write(b, 0, n);
			}
		}
	}

	/**
	 * Maps a URL to a path relative to the download root: the scheme, query string and
	 * fragment are dropped, and directory-style URLs get {@code index.html} appended.
	 */
	static String localPath(String ur)
	{
		String path=ur;
		int schemeEnd=path.indexOf("://");
		if(schemeEnd>=0)
		{
			path=path.substring(schemeEnd+3);
		}
		// '?' and '#' parts are not part of the file name (and '?' is illegal on Windows).
		int query=path.indexOf('?');
		if(query>=0)
		{
			path=path.substring(0, query);
		}
		int fragment=path.indexOf('#');
		if(fragment>=0)
		{
			path=path.substring(0, fragment);
		}
		if(path.isEmpty()||path.endsWith("/"))
		{
			path=path + "index.html";
		}
		else if(path.indexOf('/')<0)
		{
			// Bare host: keep the host name usable as a directory for later files.
			path=path + "/index.html";
		}
		return path;
	}

	/** Downloads {@link #ur} and reports success or failure on the console. */
	@Override
	public void run() {
		try {
			download(ur);
			System.out.println(Thread.currentThread().getName() + " 下载" + ur + "成功");
		} catch (IOException e) {
			// A failed download must not kill the worker thread, but it should be visible.
			System.err.println("下载失败：" + ur + " (" + e + ")");
		}
	}
}

  
 
  1
  2
  3
  4
  5
  6
  7
  8
  9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74