用post与get方法抓取网页内容出现问题！

码拜

10年 ago

爬虫 java html url import java.io.BufferedReader; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.PostMethod; public class testSpider { private static HttpClient httpClient=new HttpClient(); //设置代理服务器 /static{ httpClient.getHostConfiguration().setProxy("172.17.18.84", 8080); }/ //用get方法抓取 public static String getHtml(String path)throws HttpException,IOException{ String html=""; GetMethod getMethod=new GetMethod(path); int statusCode=httpClient.executeMethod(getMethod); if(statusCode!=HttpStatus.SC_OK){ System.err.println("Method failed: " + getMethod.getStatusLine()); } html=getMethod.getResponseBodyAsString(); getMethod.releaseConnection(); return html; } public static String downloadPage(String path)throws HttpException,IOException{ String html=""; InputStream inputStream=null; OutputStream outputStream=null; PostMethod postMethod=new PostMethod(path); postMethod.setRequestHeader("accept", "/"); postMethod.setRequestHeader("connection", "Keep-Alive"); postMethod.setRequestHeader("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)"); postMethod.setRequestHeader("Accept-Language", "zh-cn,zh;q=0.5"); int statusCode=httpClient.executeMethod(postMethod); System.out.println(statusCode); //状态码是200 if(statusCode==HttpStatus.SC_OK){ System.out.println("*******************"); String str=postMethod.getResponseBodyAsString(); postMethod.releaseConnection();//释放连接 return str; } //状态码是3XX if((statusCode==HttpStatus.SC_MOVED_TEMPORARILY)\|\|(statusCode==HttpStatus.SC_MOVED_PERMANENTLY)\|\| 
(statusCode==HttpStatus.SC_SEE_OTHER)\|\| (statusCode==HttpStatus.SC_TEMPORARY_REDIRECT)){ //读取新的URL地址,从头中取出转向的地址 Header header=postMethod.getResponseHeader("location"); String location=null; if(header!=null){ location=header.getValue(); System.out.println("The page was redirected to:" + location); } else{ System.err.println("Location field value is null."); } //return html; } //获取服务器的编码格式 System.out.println(postMethod.getResponseCharSet()); BufferedReader in=new BufferedReader(new InputStreamReader (postMethod.getResponseBodyAsStream(), postMethod.getResponseCharSet())); StringBuffer sb=new StringBuffer(); int chari; while((chari=in.read())!=-1){ sb.append((char)chari); } html=sb.toString(); in.close(); postMethod.releaseConnection(); return html; } public static void main(String args[]){ try{ String pathString="http://www.baidu.com"; System.out.println(testSpider.downloadPage(pathString)); //System.out.println(testSpider.getHtml(pathString)); } catch(HttpException e){ e.printStackTrace(); } catch(IOException e){ e.printStackTrace(); } } } 刚刚开始学习爬虫，按照《自己动手写爬虫》写的代码，发现有不少问题，就从网上融合了多方的代码，结果如上。我的问题：用get方法抓取时可以正常得到，用post方法时（如访问www.baidu.com，出现302转向）访问csdn时出现405，请问该怎样解决？谢谢大家
10分	打开页面（浏览器普通浏览）是GET方法获得页面数据，如果采用POST方法，则没有相应的POST处理程序（action或servlet）所以报错
	baidu最近刚刚变的，http访问首页统统跳转到https的首页
	引用：“打开页面（浏览器普通浏览）是GET方法获得页面数据，如果采用POST方法，则没有相应的POST处理程序（action或servlet）所以报错。” 这就是说无法使用post抓取网页内容？还是什么意思？刚开始学，问了很浅显的问题，见谅~
10分	post也是可以的，但是要把这个打开：method.setFollowRedirects(true); PostMethod 默认是 setFollowRedirects(false)；GetMethod 默认是 setFollowRedirects(true)。可以看看 PostMethod 和 GetMethod 的源码：public GetMethod() { setFollowRedirects(true); }；PostMethod 继承 EntityEnclosingMethod：public EntityEnclosingMethod() { super(); setFollowRedirects(false); }。注意：在 HttpClient 3.x 中，对 PostMethod 调用 setFollowRedirects(true) 会抛出 IllegalArgumentException，因此 POST 遇到 3xx 时需要自己读取 Location 头并重新发起请求。
	POST方法是为浏览器向服务器提交数据准备的方法，你为什么想用POST方法抓？