使用 HtmlAgilityPack XPath 表达式来抓取博客园数据:使用 WebClient 下载数据,用 HtmlAgilityPack XPath 表达式解析数据,并绑定到 Repeater 控件。
Web 前端代码 复制代码 代码如下:

<%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title></title>
</head>
<body>
    <form id="form1" runat="server">
    <div>
        <table cellpadding="1" cellspacing="1" bgcolor="#f1f1f1" style="text-align: center">
            <asp:Repeater ID="Repeater1" runat="server">
                <HeaderTemplate>
                    <tr>
                        <td> 标题 </td>
                        <td> 发布作者 </td>
                        <td> 发布时间 </td>
                    </tr>
                </HeaderTemplate>
                <ItemTemplate>
                    <tr bgcolor="#ffffff">
                        <td align="left">
                            <a href='<%#Eval("url") %>' target="_blank"> <%#Eval("title") %> </a>
                        </td>
                        <td>
                            <a href='<%#Eval("authorUrl") %>' target="_blank"> <%#Eval("author") %> </a>
                        </td>
                        <td> <%#Eval("updatetime") %> </td>
                    </tr>
                </ItemTemplate>
            </asp:Repeater>
        </table>
    </div>
    </form>
</body>
</html>

cs 后台代码: 复制代码 代码如下:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using S1;
using System.Net;
using System.IO;
using System.Text;
using HtmlAgilityPack;

/// <summary>
/// Code-behind for Default.aspx. On the first (non-postback) request it downloads a
/// cnblogs.com page, extracts the post list with HtmlAgilityPack XPath queries and
/// binds the result to <c>Repeater1</c>.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>Site whose post list is scraped.</summary>
    private const string BaseAddress = "http://www.cnblogs.com";

    /// <summary>
    /// Downloads and parses the post list, then binds it to the repeater.
    /// Does nothing on postbacks.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (IsPostBack)
        {
            return;
        }

        string address = BaseAddress;
        string pageSegment = Request.QueryString["p"];
        if (!string.IsNullOrEmpty(pageSegment))
        {
            // Paging: p=p2, p=p3, ...
            // NOTE(review): the query-string value is appended to the outbound URL
            // without validation; consider restricting it to the expected "pN" form.
            address += "/" + pageSegment;
        }

        this.Repeater1.DataSource = ParsePosts(DownloadHtml(address));
        this.Repeater1.DataBind();
    }

    /// <summary>
    /// Downloads <paramref name="address"/> and returns the response body decoded as UTF-8.
    /// </summary>
    /// <param name="address">Absolute URL to fetch.</param>
    /// <returns>The page content as a string.</returns>
    private static string DownloadHtml(string address)
    {
        // The client, the response stream and the reader are all IDisposable;
        // dispose them deterministically instead of leaking the connection.
        using (WebClient client = new WebClient())
        using (Stream stream = client.OpenRead(address))
        using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the posts found under the element with id <c>post_list</c>.
    /// </summary>
    /// <param name="html">Raw HTML of the page.</param>
    /// <returns>
    /// The posts in document order; an empty list when the container or the title
    /// links are not present.
    /// </returns>
    private static IList<Cnblogs> ParsePosts(string html)
    {
        IList<Cnblogs> posts = new List<Cnblogs>();

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // GetElementbyId returns null when no element carries that id.
        HtmlNode postList = doc.GetElementbyId("post_list");
        if (postList == null)
        {
            return posts;
        }

        // "div[2]" is XPath's 1-based position: the second <div> child of its parent
        // (presumably the post body container -- verify against the live markup).
        // The leading "." scopes the query to post_list; a bare "//" would search
        // from the document root even when called on a child node.
        HtmlNodeCollection titleLinks = postList.SelectNodes(".//div[2]/h3/a");
        if (titleLinks == null)
        {
            // SelectNodes returns null, not an empty collection, when nothing matches.
            return posts;
        }

        foreach (HtmlNode titleLink in titleLinks)
        {
            Cnblogs post = new Cnblogs();
            // Post link address.
            post.url = titleLink.GetAttributeValue("href", string.Empty);
            // Post title.
            post.title = titleLink.InnerText;
            posts.Add(post);
        }

        HtmlNodeCollection authorLinks = postList.SelectNodes(".//div[2]/div/a");
        if (authorLinks == null)
        {
            return posts;
        }

        // Pair titles and authors by position; never index past the shorter list.
        int count = Math.Min(posts.Count, authorLinks.Count);
        for (int i = 0; i < count; i++)
        {
            HtmlNode authorLink = authorLinks[i];
            posts[i].author = authorLink.InnerText;
            posts[i].authorUrl = authorLink.GetAttributeValue("href", string.Empty);

            // The node following the author link holds the publish time,
            // prefixed with the label "发布于" which is stripped here.
            HtmlNode timeNode = authorLink.NextSibling;
            if (timeNode != null)
            {
                posts[i].updatetime = timeNode.InnerText.Replace("发布于", "").Trim();
            }
        }

        return posts;
    }

    /// <summary>
    /// One scraped post. Property names are referenced by the <c>Eval(...)</c>
    /// expressions in Default.aspx and must not be renamed.
    /// </summary>
    public class Cnblogs
    {
        public string title { get; set; }
        public string url { get; set; }
        public string author { get; set; }
        public string authorUrl { get; set; }
        public string updatetime { get; set; }
    }
}