C#获取JS处理后的html代码

.Net技术 码拜 8年前 (2016-09-23) 3295次浏览
之前尝试过使用 WebBrowser 控件来获取,可能是使用的方法不对,获取不到 JS 执行后的 html 代码,代码如下:

namespace WindowsFormsApplication1
{
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }
        private void Form1_Load(object sender, EventArgs e)
        {
        }
        private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            while (this.webBrowser1.ReadyState != WebBrowserReadyState.Complete&&this.webBrowser1.IsBusy!= false)
                {                                   
                }
                string html = this.webBrowser1.DocumentText;
            textBox2.Text = "";
            textBox2.Text += html;
        }
        private void button1_Click(object sender, EventArgs e)
        {
            this.webBrowser1.Url = new Uri(this.textBox1.Text.Trim());            
        }
    }
}

请高手指点!可以告知失败原因,但假如可以不使用webBrowser获取JS处理后html代码最好

解决方案

40

手里刚好有一份,你试试吧

		/// <summary>
		/// Loads "bjtime.cn/" through <c>FinalHtml</c> and, when the load succeeds, writes a report
		/// (link tags, image tags, body HTML) to "out.txt" and appends the same text to richTextBox1.
		/// Does nothing when the load fails or times out.
		/// </summary>
		private void GetHTMLAfterJS()
        {
            FinalHtml html = new FinalHtml();
            if (!html.Run("bjtime.cn/"))
                return;

            // Build the report in memory once so the file and the text box get identical content.
            // (Appending the StreamWriter itself to Text only appended its type name.)
            String report;
            using (StringWriter buffer = new StringWriter())
            {
                buffer.WriteLine("Link list:");
                foreach (String link in html.LinkList)
                    buffer.WriteLine(link);
                buffer.WriteLine("Image List:");
                foreach (String image in html.ImageList)
                    buffer.WriteLine(image);
                buffer.WriteLine("Html Body:");
                buffer.WriteLine(html.HtmlBody);
                report = buffer.ToString();
            }

            // WriteAllText truncates an existing file and always closes it, even on error;
            // File.OpenWrite would leave stale bytes behind when the new report is shorter.
            File.WriteAllText("out.txt", report);
            this.richTextBox1.Text += report;
        }

下面先是上述调用示例(原帖中重复粘贴了一次),随后才是 FinalHtml 类的定义:

		/// <summary>
		/// Loads "bjtime.cn/" through <c>FinalHtml</c> and, when the load succeeds, writes a report
		/// (link tags, image tags, body HTML) to "out.txt" and appends the same text to richTextBox1.
		/// Does nothing when the load fails or times out.
		/// </summary>
		private void GetHTMLAfterJS()
        {
            FinalHtml html = new FinalHtml();
            if (!html.Run("bjtime.cn/"))
                return;

            // Build the report in memory once so the file and the text box get identical content.
            // (Appending the StreamWriter itself to Text only appended its type name.)
            String report;
            using (StringWriter buffer = new StringWriter())
            {
                buffer.WriteLine("Link list:");
                foreach (String link in html.LinkList)
                    buffer.WriteLine(link);
                buffer.WriteLine("Image List:");
                foreach (String image in html.ImageList)
                    buffer.WriteLine(image);
                buffer.WriteLine("Html Body:");
                buffer.WriteLine(html.HtmlBody);
                report = buffer.ToString();
            }

            // WriteAllText truncates an existing file and always closes it, even on error;
            // File.OpenWrite would leave stale bytes behind when the new report is shorter.
            File.WriteAllText("out.txt", report);
            this.richTextBox1.Text += report;
        }


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Windows.Forms;
using System.IO;
using System.Diagnostics;
namespace GetFinalHTML
{
    public class FinalHtml
    {
        private String htmlString;
        private String url;
        private String htmlTitle;
        // 获得html title标签的内容
        public String HtmlTitle
        {
            get
            {
                if (success == false) return null;
                return htmlTitle;
            }
        }
        private List<String> linkList;
        private List<String> imageList;
        private bool success; // 能否成功运行
        /// <summary>
        /// 获得网页全部链接的链表, 一定要在Run之后进行
        /// </summary>
        public List<String> LinkList
        {
            get
            {
                if (success == false) return null;
                return linkList;
            }
        }
        /// <summary>
        /// 获得全部图像的标签, 一定要在Run之后进行
        /// </summary>
        public List<String> ImageList
        {
            get
            {
                if (success == false) return null;
                return imageList;
            }
        }
        /// <summary>
        /// 获得执行完js之后的网页body 部分的html代码
        /// </summary>
        public String HtmlBody
        {
            get
            {
                if (success == false) return null;
                return htmlString;
            }
        }
        public FinalHtml()
        {
            linkList = new List<String>();
            imageList = new List<String>();
            htmlString = "";
            success = false;
        }
        /// <summary>
        /// 检查并补充设置url
        /// </summary>
        /// <param name="url"></param>
        private void CheckURL(String url)
        {
            if (!url.StartsWith("http://") && !url.StartsWith("https://") && !url.StartsWith("file:///"))
                url = "http://" + url;
            this.url = url;
        }
        /// <summary>
        /// 加载指定文件
        /// </summary>
        /// <param name="url">文件URL</param>
        /// <param name="timeOut">超时时限</param>
        /// <returns>能否成功运行,没有超时</returns>
        public bool Run(String url, int timeOut = 10000)
        {
            CheckURL(url);
            Thread newThread = new Thread(NewThread);
            newThread.SetApartmentState(ApartmentState.STA);/// 为了创建WebBrowser类的实例 必须将对应线程设为单线程单元
            newThread.Start();
            //监督子线程运行时间
            while (newThread.IsAlive && timeOut > 0)
            {
                Thread.Sleep(100);
                timeOut -= 100;
            }
            // 超时处理
            if (newThread.IsAlive)
            {
                if (success) return true;
                newThread.Abort();
                return false;
            }
            return true;
        }
        private void NewThread()
        {
            new FinalHtmlPerThread(this);
            Application.Run();// 循环等待webBrowser 加载完毕 调用 DocumentCompleted 事件
        }
        /// <summary>
        ///  用于处理一个url的核心类
        /// </summary>
        class FinalHtmlPerThread : IDisposable
        {
            FinalHtml master;
            WebBrowser web;
            public FinalHtmlPerThread(FinalHtml master)
            {
                this.master = master;
                DealWithUrl();
            }
            private void DealWithUrl()
            {
                String url = master.url;
                web = new WebBrowser();
                bool success = false;
                try
                {
                    web.Url = new Uri(url);
                    web.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(web_DocumentCompleted); // 对事件加委托
                    success = true;
                }
                finally
                {
                    if (!success)
                        Dispose();
                }
            }
            public void Dispose()
            {
                if (!web.IsDisposed)
                    web.Dispose();
            }
            private void ToList(HtmlElementCollection collection, List<String> list)
            {
                System.Collections.IEnumerator it = collection.GetEnumerator();
                while (it.MoveNext())
                {
                    HtmlElement htmlElement = (HtmlElement)it.Current;
                    list.Add(htmlElement.OuterHtml);
                }
            }
            private void web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
            {
                //微软官方回答 一个网页有多个Ifram元素就有可能触发多次此事件, 并且提到了
                // vb 和 C++ 的解决方案, C# 没有提及, 经本人尝试,发现下面的语句可以判断成功
                // 假如未完全加载 web.ReadyState = WebBrowserReadyState.Interactive
                if (web.ReadyState != WebBrowserReadyState.Complete) return;
                master.htmlTitle = web.Document.Title;
                ToList(web.Document.Links, master.linkList);
                ToList(web.Document.Images, master.imageList);
                master.htmlString = web.Document.Body.InnerHtml;
                master.success = true;
                Thread.CurrentThread.Abort();
            }
        }
    }
}

CodeBye 版权所有丨如未注明 , 均为原创丨本网站采用BY-NC-SA协议进行授权 , 转载请注明C#获取JS处理后的html代码
喜欢 (0)
[1034331897@qq.com]
分享 (0)