用 WebClient 抓取 https 页面时,得到的字符串全是乱码;源网页为英文,UTF-8 等各种编码都尝试过,仍未解决。以下是网上看到的
解决方案一:
使用HttpWebRequest
/// <summary>
/// Fetches the resource at <paramref name="sUrl"/> with an <c>HttpWebRequest</c>
/// and returns the response body decoded as UTF-8.
/// </summary>
/// <param name="sUrl">Address of the page to fetch.</param>
/// <returns>
/// The response body, or an empty string if any exception occurred
/// (the exception message is shown in a message box).
/// </returns>
private string GetWebContent(string sUrl)
{
    string strResult = "";
    try
    {
        // Create the HTTP request.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(sUrl);
        // Timeout is expressed in milliseconds (3,000,000 ms = 50 minutes).
        request.Timeout = 3000000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, its stream and the reader deterministically so
        // the underlying connection is released even if reading fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.UTF8))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch (Exception exp)
    {
        // Best-effort: report the failure to the user and return the empty result.
        MessageBox.Show(exp.Message);
    }
    return strResult;
}
此方式没有尝试。也可以从网页响应中获取编码:
/// <summary>
/// Downloads a web page and returns its source decoded to text.
/// </summary>
/// <param name="url">Address of the page.</param>
/// <param name="charSet">
/// Name of the encoding to decode with. If null, empty, or not a known encoding
/// name, the encoding is taken from the page's <c>&lt;meta ... charset=...&gt;</c>
/// declaration, falling back to UTF-8.
/// </param>
/// <returns>The page source, or <c>null</c> if the download or decoding failed.</returns>
public static string GetHtml(string url, string charSet)
{
    try
    {
        byte[] myDataBuffer;
        using (System.Net.WebClient client = new System.Net.WebClient())
        {
            myDataBuffer = client.DownloadData(url);
        }

        // Honor the caller-supplied encoding first.
        System.Text.Encoding encoding = null;
        if (!string.IsNullOrEmpty(charSet))
        {
            encoding = TryGetEncoding(charSet);
        }

        if (encoding == null)
        {
            // Decode once with the system default purely to sniff the charset
            // declaration; the declaration itself is plain ASCII.
            string sniffed = System.Text.Encoding.Default.GetString(myDataBuffer);
            Match charSetMatch = Regex.Match(
                sniffed,
                "<meta[^>]*charset\\s*=\\s*[\"']?([^\"'\\s/>;]+)",
                RegexOptions.IgnoreCase);

            if (charSetMatch.Success)
            {
                encoding = TryGetEncoding(charSetMatch.Groups[1].Value);
            }

            // No declaration, or an unknown name: fall back to UTF-8.
            encoding = encoding ?? System.Text.Encoding.UTF8;
        }

        return encoding.GetString(myDataBuffer);
    }
    catch (Exception)
    {
        // Best-effort contract: any failure is reported to the caller as null.
        return null;
    }
}

/// <summary>
/// Resolves an encoding by name, returning <c>null</c> instead of throwing when the name is unknown.
/// </summary>
private static System.Text.Encoding TryGetEncoding(string name)
{
    try
    {
        return System.Text.Encoding.GetEncoding(name.Trim());
    }
    catch (ArgumentException)
    {
        return null;
    }
}
解决方案二:需要把获取到的 https 页面字节流通过 gzip 解压。用这种方法解决了乱码问题。
/// <summary>
/// Downloads a page and returns its source, decompressing the body first when
/// the server marks the response as gzip-encoded.
/// </summary>
/// <param name="url">Address of the page.</param>
/// <param name="encoding">Encoding used to decode the (decompressed) response body.</param>
/// <returns>
/// The page source; an empty string if the request fails, the status is not
/// 200 OK, or the declared content length is 1 MiB or more.
/// </returns>
public static string GetHtml(string url, Encoding encoding)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 20000;            // milliseconds
        request.AllowAutoRedirect = false;  // redirects are not followed; a 3xx yields an empty result

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
            {
                return string.Empty;
            }

            Stream body = response.GetResponseStream();
            // A gzip-compressed body decoded directly as text is what shows up as garbage.
            if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            // Disposing the reader also disposes the wrapped stream(s).
            using (StreamReader reader = new StreamReader(body, encoding))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (Exception)
    {
        // Best-effort contract: any failure is reported to the caller as an empty string.
        return string.Empty;
    }
}
用 WebClient 抓取 https 页面出现乱码的原因很多,以上方法不保证解决所有可能的问题,仅供参考。