Java获取网络文件并插入数据库

2012-11-12

  抓取各大网站的数据插入数据库,这样就不用为没有数据而烦恼了

  获取百度的歌曲名,歌手和链接!!

  package webTools;

  import java.io.BufferedReader;

  import java.io.IOException;

  import java.io.InputStreamReader;

  import java.io.UnsupportedEncodingException;

  import java.net.MalformedURLException;

  import java.net.URL;

  import java.util.ArrayList;

  import java.util.HashMap;

  import java.util.List;

  import java.util.regex.Matcher;

  import java.util.regex.Pattern;

  import dbTools.DBTools;

  public class IOTOWeb {

  /**
   * Downloads the document at the given URL and returns its text.
   *
   * <p>The response is decoded as GB2312 and read line by line. Line terminators are
   * dropped, so the result is all lines concatenated without any separator.
   *
   * <p>Failures are not propagated to the caller: the stack trace is printed and
   * whatever was read before the failure (possibly the empty string) is returned.
   *
   * @param htmlURL absolute URL of the page to fetch
   * @return the page text with line breaks removed; empty if nothing could be read
   */
  public String getHtmlContent(String htmlURL) {
    StringBuilder htmlContent = new StringBuilder();
    // Tracked separately from the reader so it is released even when the reader
    // cannot be constructed or a read fails half-way through the document.
    java.io.InputStream stream = null;
    try {
      URL url = new URL(htmlURL);
      stream = url.openStream();
      BufferedReader in = new BufferedReader(new InputStreamReader(stream, "gb2312"));
      String rowContent;
      while ((rowContent = in.readLine()) != null) {
        htmlContent.append(rowContent);
      }
    } catch (MalformedURLException e) {
      e.printStackTrace();
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (stream != null) {
        try {
          stream.close();
        } catch (IOException e) {
          // Same best-effort policy as the read path: report and carry on.
          e.printStackTrace();
        }
      }
    }
    return htmlContent.toString();
  }

  public List getLink(String htmlContent) {

  ArrayList listLink = new ArrayList();

  String regex = "<td[^>]*>[//(]*<a[^>]*href=(/"([^/"]*)/"|/'([^/']*)/'|([^//s>]*))[^>]*>(.*?)[//)]*[//s]*</td>";

  Pattern pattern = Pattern.compile(regex, Pattern.DOTALL);

  Matcher matcher = pattern.matcher(htmlContent);

  while (matcher.find()) {

  listLink.add(matcher.group());

  }

  return listLink;

  }

  public List<String> getHref(String htmlContent) {

  String regex;

  List listtHref = new ArrayList();

  regex = "href=(/"([^/"]*)/"|/'([^/']*)/'|([^//s>]*))/"";

  Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

  Matcher ma = pa.matcher(htmlContent);

  while (ma.find()) {

  listtHref.add(ma.group().replaceFirst("href=/"", "").replace("/"",

  ""));

  }

  return listtHref;

  }

  public List<String> getPerson(String htmlContent) {

  String regex;

  List list = new ArrayList();

  regex = "//(<a[^>]*href=(/"([^/"]*)/"|/'([^/']*)/'|([^//s>]*))[^>]*>(.*?)//)";

  Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

  Matcher ma = pa.matcher(htmlContent);

  while (ma.find()) {

  list.add(ma.group().replaceFirst("href=/"", "").replace("/"", ""));

  }

  return list;

  }

  public List<String> getSongName(String htmlContent) {

  String regex;

  List listPerson = new ArrayList();

  regex = "<a[^>]*href=(/"([^/"]*)/"|/'([^/']*)/'|([^//s>]*))[^>]*>(.*?)</a>//s";

  Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

  Matcher ma = pa.matcher(htmlContent);

  while (ma.find()) {

  listPerson.add(ma.group());

  }

  return listPerson;

  }

  public String getMainContent(String htmlContent) {

  String regex = "<table width=/"100%/" align=/"center/" cellpadding=/"0/" cellspacing=/"0/" class=/"list/">(.*?)</table>";

  StringBuffer mainContent = new StringBuffer();

  Pattern pattern = Pattern.compile(regex, Pattern.DOTALL);

  Matcher matcher = pattern.matcher(htmlContent);

  while (matcher.find()) {

  mainContent.append(matcher.group());

  }

  return mainContent.toString();

  }

  /**
   * Strips markup tags from a string.
   *
   * <p>Every shortest run that starts with {@code <} and ends with {@code >} is
   * deleted. The pattern is not compiled with DOTALL, so such a run must sit on a
   * single line to be removed.
   *
   * @param s text that may contain tags
   * @return {@code s} with all matched tags removed
   */
  public String outTag(final String s) {
    final Pattern tagPattern = Pattern.compile("<.*?>");
    final Matcher tagMatcher = tagPattern.matcher(s);
    return tagMatcher.replaceAll("");
  }

  // Project-local database helper (dbTools.DBTools), created with its no-arg constructor
  // once per IOTOWeb instance. Package-private. Its use is not visible in this part of
  // the file -- presumably it persists the scraped rows; confirm against the caller.
  DBTools dbTools = new DBTools();

  public void getFromBaiduMap3(String htmlURL) throws Throwable {

  HashMap htmlContentMap = new HashMap();

  String htmlContent = getHtmlContent(htmlURL);

  String mainContent = getMainContent(htmlContent);

  List listLink = getLink(mainContent);

  for (int j = 0; j < listLink.size(); j++) {

  String tdTag = listLink.get(j).toString();

  List songNameList = getSongName(tdTag);

  String songName = outTag(songNameList.get(0).toString());

  List personList = getPerson(tdTag);

  String songPerson = "";

  if (personList.size() != 0) {

  for (int n = 0; n < personList.size(); n++) {

  // System.out.println(personList.get(n).toString());

  songPerson = outTag(personList.get(n).toString());

  }

  } else {

  songPerson = "无";

  }

分享到:
0
相关阅读
友情链接
© 2018 我考网 http://www.woexam.com 中国互联网举报中心 湘ICP备18023104号 京公网安备 11010802020116号
违法和不良信息举报:9447029@qq.com