之前尝试过使用 WebBrowser 控件来获取,可能是使用的方法不对,无法获取 JS 执行后的 HTML 代码,代码如下:
namespace WindowsFormsApplication1
{
    /// <summary>
    /// Test form: navigates webBrowser1 to the address typed into textBox1 and shows
    /// the resulting HTML in textBox2 once the document has finished loading.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        private void Form1_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Copies the HTML of the loaded document into textBox2.
        /// </summary>
        /// <param name="sender">The WebBrowser control raising the event.</param>
        /// <param name="e">Event data for the completed document.</param>
        private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // DocumentCompleted can fire more than once per page (e.g. once per frame).
            // Returning early and waiting for the next event replaces the former empty
            // spin loop, which ran on the UI thread and therefore prevented the very
            // state change it was waiting for.
            if (this.webBrowser1.ReadyState != WebBrowserReadyState.Complete)
            {
                return;
            }

            // DocumentText reflects the source the control loaded, not changes that
            // scripts made to the DOM afterwards, so serialize the live <html> element.
            string html = null;
            HtmlDocument document = this.webBrowser1.Document;
            if (document != null)
            {
                HtmlElementCollection roots = document.GetElementsByTagName("html");
                if (roots.Count > 0)
                {
                    html = roots[0].OuterHtml;
                }
            }

            // Fall back to the raw source when no <html> element is available.
            if (html == null)
            {
                html = this.webBrowser1.DocumentText;
            }

            textBox2.Text = html;
        }

        /// <summary>
        /// Starts navigation to the address entered in textBox1.
        /// </summary>
        /// <param name="sender">The clicked button.</param>
        /// <param name="e">Unused event data.</param>
        private void button1_Click(object sender, EventArgs e)
        {
            // Uri.TryCreate avoids an unhandled UriFormatException on malformed input.
            Uri target;
            if (Uri.TryCreate(this.textBox1.Text.Trim(), UriKind.Absolute, out target))
            {
                this.webBrowser1.Url = target;
            }
            else
            {
                MessageBox.Show("Please enter an absolute URL, e.g. http://example.com/");
            }
        }
    }
}
请高手指点!可以告知失败原因;如果能不使用 WebBrowser 就获取到 JS 处理后的 HTML 代码,那就更好了。
解决方案
40
手里刚好有一份,你试试吧
/// <summary>
/// Loads "bjtime.cn/" through <c>FinalHtml</c> and, when the load succeeds, writes a
/// report (link list, image list, body HTML) to out.txt and appends the same report
/// to richTextBox1.
/// </summary>
private void GetHTMLAfterJS()
{
    FinalHtml html = new FinalHtml();
    if (!html.Run("bjtime.cn/"))
    {
        return;
    }

    // The list properties may be null when nothing was captured; treat that as empty.
    List<String> linkList = html.LinkList ?? new List<String>();
    List<String> imageList = html.ImageList ?? new List<String>();

    string report;
    using (StringWriter buffer = new StringWriter())
    {
        buffer.WriteLine("Link list:");
        foreach (String link in linkList)
        {
            buffer.WriteLine(link);
        }

        buffer.WriteLine("Image List:");
        foreach (String image in imageList)
        {
            buffer.WriteLine(image);
        }

        buffer.WriteLine("Html Body:");
        buffer.WriteLine(html.HtmlBody);
        report = buffer.ToString();
    }

    // WriteAllText replaces any existing file and always closes it; the former
    // File.OpenWrite left stale trailing bytes when the new report was shorter.
    File.WriteAllText("out.txt", report);

    // Append the report text itself; appending the writer object only added its type name.
    this.richTextBox1.Text += report;
}
这是FinalHtml类的定义:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Windows.Forms;
using System.IO;
using System.Diagnostics;

namespace GetFinalHTML
{
    /// <summary>
    /// Loads a page in a WebBrowser control hosted on a dedicated STA thread and
    /// captures the document title, link and image markup, and the body HTML once
    /// the control reports the document as complete.
    /// </summary>
    /// <remarks>
    /// <see cref="Run"/> writes instance fields without synchronization, so do not
    /// call it concurrently on the same instance.
    /// </remarks>
    public class FinalHtml
    {
        /// <summary>
        /// Extra time, in milliseconds, granted to the worker thread to shut down its
        /// message loop after the load timeout has elapsed.
        /// </summary>
        private const int ShutdownGraceMilliseconds = 2000;

        private String htmlString;
        private String url;
        private String htmlTitle;
        private List<String> linkList;
        private List<String> imageList;
        private bool success; // whether the most recent Run captured a document

        /// <summary>
        /// Gets the content of the HTML title tag, or null when the last Run did not succeed.
        /// </summary>
        public String HtmlTitle
        {
            get
            {
                if (!success)
                {
                    return null;
                }
                return htmlTitle;
            }
        }

        /// <summary>
        /// Gets the outer HTML of every link in the page, or null when the last Run
        /// did not succeed. Only meaningful after Run.
        /// </summary>
        public List<String> LinkList
        {
            get
            {
                if (!success)
                {
                    return null;
                }
                return linkList;
            }
        }

        /// <summary>
        /// Gets the outer HTML of every image tag in the page, or null when the last
        /// Run did not succeed. Only meaningful after Run.
        /// </summary>
        public List<String> ImageList
        {
            get
            {
                if (!success)
                {
                    return null;
                }
                return imageList;
            }
        }

        /// <summary>
        /// Gets the inner HTML of the page body as it was when the document completed,
        /// or null when the last Run did not succeed.
        /// </summary>
        public String HtmlBody
        {
            get
            {
                if (!success)
                {
                    return null;
                }
                return htmlString;
            }
        }

        public FinalHtml()
        {
            linkList = new List<String>();
            imageList = new List<String>();
            htmlString = "";
            success = false;
        }

        /// <summary>
        /// Normalizes the URL and stores it: "http://" is prepended unless the value
        /// already starts with http://, https:// or file:///.
        /// </summary>
        /// <param name="url">URL as supplied by the caller.</param>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        private void CheckURL(String url)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            url = url.Trim();

            // Schemes are case-insensitive; an ordinal comparison also avoids
            // culture-dependent matching.
            if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                && !url.StartsWith("file:///", StringComparison.OrdinalIgnoreCase))
            {
                url = "http://" + url;
            }

            this.url = url;
        }

        /// <summary>
        /// Loads the given URL and captures the resulting document.
        /// </summary>
        /// <param name="url">URL of the page or file to load.</param>
        /// <param name="timeOut">Time limit in milliseconds; values of zero or less fail immediately.</param>
        /// <returns>
        /// true when the document completed within the time limit and was captured;
        /// false on timeout, on a malformed URL, or when the worker ended without a result.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public bool Run(String url, int timeOut = 10000)
        {
            CheckURL(url);

            // Discard the outcome of any earlier run so a failure cannot expose stale
            // data. Fresh lists are used so lists already handed out are not mutated.
            success = false;
            htmlTitle = null;
            htmlString = "";
            linkList = new List<String>();
            imageList = new List<String>();

            if (timeOut <= 0)
            {
                return false;
            }

            // Validate here so a malformed URL yields false instead of an unhandled
            // exception on the worker thread.
            Uri target;
            if (!Uri.TryCreate(this.url, UriKind.Absolute, out target))
            {
                return false;
            }

            PageResult result = new PageResult();
            Thread newThread = new Thread(delegate() { NewThread(target, timeOut, result); });
            // WebBrowser can only be instantiated on a single-threaded apartment thread.
            newThread.SetApartmentState(ApartmentState.STA);
            // A worker that never returns must not keep the process alive.
            newThread.IsBackground = true;
            newThread.Start();

            // The worker enforces the time limit itself; the grace period only covers
            // the teardown of its message loop.
            int joinTimeout = timeOut > int.MaxValue - ShutdownGraceMilliseconds
                ? int.MaxValue
                : timeOut + ShutdownGraceMilliseconds;
            if (!newThread.Join(joinTimeout))
            {
                // The worker only writes to its own PageResult, so abandoning it here
                // cannot alter this instance later.
                return false;
            }

            if (!result.Success)
            {
                return false;
            }

            htmlTitle = result.Title;
            htmlString = result.Body;
            linkList = result.Links;
            imageList = result.Images;
            success = true;
            return true;
        }

        /// <summary>
        /// Worker thread body: starts the navigation, arms a timeout timer and runs a
        /// message loop until the document completes or the timer fires.
        /// </summary>
        /// <param name="target">Validated absolute URL to load.</param>
        /// <param name="timeOut">Time limit in milliseconds; must be positive.</param>
        /// <param name="result">Receives the captured document data.</param>
        private static void NewThread(Uri target, int timeOut, PageResult result)
        {
            // Both objects are disposed on the thread that created them once the
            // message loop has ended, whichever way it ended.
            using (FinalHtmlPerThread page = new FinalHtmlPerThread(target, result))
            using (System.Windows.Forms.Timer watchdog = new System.Windows.Forms.Timer())
            {
                watchdog.Interval = timeOut;
                watchdog.Tick += delegate { Application.ExitThread(); };
                watchdog.Start();

                // Pump messages so the WebBrowser can load and raise DocumentCompleted.
                Application.Run();
            }
        }

        /// <summary>
        /// Data captured from one page load; filled in by the worker thread and read
        /// by Run only after that thread has ended.
        /// </summary>
        private sealed class PageResult
        {
            public PageResult()
            {
                Body = "";
                Links = new List<String>();
                Images = new List<String>();
            }

            public String Title { get; set; }
            public String Body { get; set; }
            public List<String> Links { get; private set; }
            public List<String> Images { get; private set; }
            public bool Success { get; set; }
        }

        /// <summary>
        /// Core helper that owns the WebBrowser for a single URL. Must be created and
        /// disposed on the STA worker thread.
        /// </summary>
        class FinalHtmlPerThread : IDisposable
        {
            private readonly PageResult result;
            private WebBrowser web;

            public FinalHtmlPerThread(Uri target, PageResult result)
            {
                this.result = result;
                DealWithUrl(target);
            }

            /// <summary>
            /// Creates the browser, hooks DocumentCompleted and starts the navigation;
            /// the browser is disposed again if any of that throws.
            /// </summary>
            private void DealWithUrl(Uri target)
            {
                web = new WebBrowser();
                bool started = false;
                try
                {
                    // Subscribe before navigating so no completion can be missed.
                    web.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(web_DocumentCompleted);
                    web.Url = target;
                    started = true;
                }
                finally
                {
                    if (!started)
                    {
                        Dispose();
                    }
                }
            }

            public void Dispose()
            {
                if (web != null && !web.IsDisposed)
                {
                    web.DocumentCompleted -= web_DocumentCompleted;
                    web.Dispose();
                }
            }

            /// <summary>
            /// Appends the outer HTML of every element in the collection to the list.
            /// </summary>
            private static void ToList(HtmlElementCollection collection, List<String> list)
            {
                foreach (HtmlElement element in collection)
                {
                    list.Add(element.OuterHtml);
                }
            }

            private void web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
            {
                // A page with several iframes can raise this event several times.
                // While loading is unfinished ReadyState is still Interactive, so
                // only the event that arrives with Complete is acted upon.
                if (web.ReadyState != WebBrowserReadyState.Complete)
                {
                    return;
                }

                HtmlDocument document = web.Document;
                if (document == null)
                {
                    return;
                }

                // Capture once only: a further event must not duplicate list entries.
                web.DocumentCompleted -= web_DocumentCompleted;

                result.Title = document.Title;
                ToList(document.Links, result.Links);
                ToList(document.Images, result.Images);

                // Body can be null (for example a frameset page); report an empty body then.
                HtmlElement body = document.Body;
                result.Body = body != null ? body.InnerHtml : "";
                result.Success = true;

                // End the message loop normally so the browser gets disposed; aborting
                // the thread skipped that cleanup and is unsupported on newer runtimes.
                Application.ExitThread();
            }
        }
    }
}