|
另外一位网友空间/IV提供的代码,功能同HttpWebRequest获取网页源代码时自动识别网页编码
复制代码 代码如下: using System; using System.Net; using System.Text; using System.Text.RegularExpressions;
class Program { // 获取网页的HTML内容,根据网页的charset自动判断Encoding static string GetHtml(string url) { return GetHtml(url, null); }
// 获取网页的HTML内容,指定Encoding static string GetHtml(string url, Encoding encoding) { byte[] buf = new WebClient().DownloadData(url); if (encoding != null) return encoding.GetString(buf); string html = Encoding.UTF8.GetString(buf); encoding = GetEncoding(html); if (encoding == null || encoding == Encoding.UTF8) return html; return encoding.GetString(buf); }
// 根据网页的HTML内容提取网页的Encoding static Encoding GetEncoding(string html) { string pattern = @"(?i)\bcharset=(?<charset>[-a-zA-Z_0-9]+)"; string charset = Regex.Match(html, pattern).Groups["charset"].Value; try { return Encoding.GetEncoding(charset); } catch (ArgumentException) { return null; } }
// 程序入口 static void Main() { Console.WriteLine(GetHtml(//www.jb51.net));
Console.Read(); } }
|
|