国庆闲暇时间,写了一个抓取 CSDN 文章的工具。采用了一些简单的算法,希望 CSDN 不要见怪。本来想实现图片自动上传,但是没有空,连文章的 doc 说明也没有仔细写。:) 开发工具:Eclipse 3.0 工作平台:Windows XP
/************************************************ * <p>csdn文章采集工具</p> * <p>csdn文章采集工具</p> * <p>CreateData: 2004-10-3 19:59:54</p> * <p>Description:</p> * <p>Copyright: Copyright (c) 2004</p> * <p>Company: 秋水工作室</p> * @author 王凯 * @version 1.0 ***********************************************/ import java.net.*; import java.sql.*; import java.io.*;
public class OpenUrl {

    /** Marker that precedes every article link on the listing page. */
    private static final String ARTICLE_MARKER = "article/";

    /**
     * Downloads the document at the given URL and returns it as text.
     *
     * <p>Every line read is terminated with {@code "\r\n"} in the result.
     * Failures are not thrown: any exception (malformed URL, I/O error, ...)
     * is reported through the return value instead.
     *
     * @param strUrl address of the page to fetch
     * @return the page text, or {@code "error open url" + strUrl} when the
     *         page could not be read
     */
    public String getContent(String strUrl) {
        BufferedReader br = null;
        try {
            URL url = new URL(strUrl);
            // NOTE(review): decodes with the platform default charset, exactly as
            // before; confirm the encoding the target site actually serves.
            br = new BufferedReader(new InputStreamReader(url.openStream()));
            StringBuffer sb = new StringBuffer();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
            return sb.toString();
        } catch (Exception e) {
            // Callers rely on the sentinel string rather than on an exception.
            return "error open url" + strUrl;
        } finally {
            // Close the stream even when reading fails part-way through.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close itself fails.
                }
            }
        }
    }

    /**
     * Fetches one article page, extracts title, keywords and content, writes
     * them to an HTML file and returns the generated HTML.
     *
     * <p>The file is written to {@code c:\csnd\<addname>\<title>.htm} via
     * {@link #scwj(String, String, String)}. Each field is taken from the
     * first matching {@code <span id="...">} up to the next {@code </span>},
     * so content that itself contains a {@code </span>} is cut at that point.
     *
     * @param Path    URL of the article page
     * @param addname sub-directory name appended to the output directory
     * @param names   not used by this method; kept for caller compatibility
     * @return the generated HTML document
     */
    public static String GetNews(String Path, String addname, String names) {
        OpenUrl ou = new OpenUrl();
        String htmlbody = ou.getContent(Path);
        String title = GetSkip(htmlbody,
                "<span id=\"ArticleTitle1_ArticleTitle1_lblTitle\">", "</span>");
        String aboutkey = GetSkip(htmlbody,
                "<span id=\"ArticleTitle1_ArticleTitle1_lblKeywords\">", "</span>");
        String content = GetSkip(htmlbody,
                "<span id=\"ArticleContent1_ArticleContent1_lblContent\">", "</span>");
        System.out.println("title=" + title);
        // The former quote-doubling call here discarded its result and only
        // existed for the SQL path; addnew() now binds parameters instead.
        String body = buildPage(title, aboutkey, content);
        OpenUrl.scwj("c:\\csnd\\" + addname, title + ".htm", body);
        return body;
    }

    /** Assembles a well-formed HTML document around the extracted article fields. */
    private static String buildPage(String title, String aboutkey, String content) {
        return "<html><head><title>" + title + "</title></head>"
                + "<body><csdntitle>标题:" + title + "</csdntitle><br>"
                + "<csdnaboutkey>" + aboutkey + "</csdnaboutkey>"
                + "<csdnbody>" + content + "</csdnbody>"
                + "</body></html>";
    }

    /**
     * Removes {@code <P ...>} and {@code <SPAN ...>} opening tags (those that
     * carry attributes) together with all {@code </P>} and {@code </SPAN>}
     * closing tags. Matching is case-sensitive (upper-case tags only).
     *
     * <p>Currently not called by {@link #GetNews(String, String, String)}.
     *
     * @param body HTML fragment to clean
     * @return the fragment with the tags removed; an opening tag that has no
     *         closing {@code >} is left in place
     */
    public static String skipp(String body) {
        System.out.println("skipi=" + body.indexOf("<P "));
        String cleaned = stripOpeningTags(body, "<P ");
        cleaned = stripOpeningTags(cleaned, "<SPAN ");
        // Strings are immutable: the results must be assigned back.
        cleaned = cleaned.replaceAll("</SPAN>", "");
        cleaned = cleaned.replaceAll("</P>", "");
        return cleaned;
    }

    /** Deletes every span running from {@code tagStart} through the next {@code >}. */
    private static String stripOpeningTags(String text, String tagStart) {
        int start = text.indexOf(tagStart);
        while (start >= 0) {
            int end = text.indexOf('>', start);
            if (end < 0) {
                // Unterminated tag: nothing more can be removed. Without this
                // exit the loop would never make progress.
                break;
            }
            text = text.substring(0, start) + text.substring(end + 1);
            // Re-scan from the beginning: joining the two halves may itself
            // form a new occurrence of tagStart.
            start = text.indexOf(tagStart);
        }
        return text;
    }

    /**
     * Returns the text located between the first occurrence of {@code spath}
     * and the next occurrence of {@code ePath} after it.
     *
     * @param body  text to search
     * @param spath start delimiter (not included in the result)
     * @param ePath end delimiter (not included in the result)
     * @return the enclosed text, or {@code ""} when either delimiter is missing
     */
    public static String GetSkip(String body, String spath, String ePath) {
        int start = body.indexOf(spath);
        if (start < 0) {
            return "";
        }
        int contentStart = start + spath.length();
        int end = body.indexOf(ePath, contentStart);
        if (end < 0) {
            return "";
        }
        return body.substring(contentStart, end);
    }

    /**
     * Usage example: downloads the article list page and saves every article
     * linked from it through {@link #GetNews(String, String, String)}.
     */
    public static void test2() {
        OpenUrl ou = new OpenUrl();
        String htmlbody = ou.getContent("http://dev.csdn.net/articlelist.aspx?c=6");
        int marker = htmlbody.indexOf(ARTICLE_MARKER);
        while (marker >= 0) {
            htmlbody = htmlbody.substring(marker + ARTICLE_MARKER.length());
            int linkEnd = htmlbody.indexOf("\" target=");
            if (linkEnd < 0) {
                // No terminated link follows; substring(0, -1) would throw.
                break;
            }
            String names = htmlbody.substring(0, linkEnd);
            String path = "http://dev.csdn.net/article/" + names;
            System.out.println(path);
            int slash = names.indexOf("/");
            String addname = "";
            if (slash >= 0) {
                // NOTE(review): only the first character is used as the directory
                // name; substring(0, slash) may have been intended - confirm.
                addname = names.substring(0, 1);
                names = names.substring(slash + 1);
            }
            OpenUrl.GetNews(path, addname, names);
            marker = htmlbody.indexOf(ARTICLE_MARKER);
        }
    }

    /**
     * Reference implementation for storing an article in the {@code develop}
     * table of the ODBC data source {@code csdn}.
     *
     * <p>Values are bound as statement parameters, so quotes in the article
     * text need no escaping. SQL errors are printed to standard output, not
     * thrown.
     *
     * @param title    article title
     * @param aboutkey article keywords
     * @param pathurl  URL the article was fetched from
     * @param body     article content
     * @return {@code true} if a row was inserted, {@code false} otherwise
     */
    public static boolean addnew(String title, String aboutkey, String pathurl, String body) {
        try {
            Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
        } catch (ClassNotFoundException e) {
            System.out.print("驱动程序不存在");
        }
        Connection odbcconn = null;
        PreparedStatement odbcstmt = null;
        try {
            odbcconn = DriverManager.getConnection("jdbc:odbc:csdn");
            odbcstmt = odbcconn.prepareStatement(
                    "insert into develop (title,aboutkey,pathurl,body) values (?,?,?,?)");
            odbcstmt.setString(1, title);
            odbcstmt.setString(2, aboutkey);
            odbcstmt.setString(3, pathurl);
            odbcstmt.setString(4, body);
            // execute() returns false for an INSERT; the update count is what
            // tells whether a row was actually added.
            return odbcstmt.executeUpdate() > 0;
        } catch (SQLException e) {
            System.out.print(e);
            return false;
        } finally {
            if (odbcstmt != null) {
                try {
                    odbcstmt.close();
                } catch (SQLException ignored) {
                    // Best effort: the insert result has already been decided.
                }
            }
            if (odbcconn != null) {
                try {
                    odbcconn.close();
                } catch (SQLException ignored) {
                    // Best effort: the insert result has already been decided.
                }
            }
        }
    }

    /**
     * Writes {@code body} followed by a newline to the file {@code FileName}
     * inside directory {@code path}, creating the directory if necessary.
     *
     * @param path     target directory
     * @param FileName name of the file to create or overwrite
     * @param body     text to write
     * @return {@code true} if the file was written without error,
     *         {@code false} otherwise (the stack trace is printed)
     */
    public static boolean scwj(String path, String FileName, String body) {
        PrintWriter out = null;
        try {
            File dir = new File(path);
            dir.mkdirs();
            // Let File join the two parts so the platform separator is used.
            File target = new File(dir, FileName);
            // NOTE(review): FileWriter encodes with the platform default charset.
            out = new PrintWriter(new FileWriter(target));
            out.print(body + "\n");
            // PrintWriter never throws on write; checkError() flushes and
            // reports whether anything went wrong.
            return !out.checkError();
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }

    /** Program entry point: runs the {@link #test2()} usage example. */
    public static void main(String args[]) {
        OpenUrl.test2();
    }
}
// End of listing.
 
|