我們有時候會需要使用其他網站的資料,
因此想寫個爬蟲將其他網站的資料下載後存檔。HtmlAgilityPack 是一個相當好用的第三方(3rd-party)套件。
相關方法如下:
於 NuGet 套件管理器主控台(Package Manager Console)下達以下指令安裝:
PM> Install-Package HtmlAgilityPack -Version 1.4.9.5
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
/// <summary>
/// Downloads an HTML page and returns the inner HTML of every node selected by a two-stage XPath lookup.
/// The first XPath picks a single container node; the container's inner HTML is re-parsed as a
/// standalone fragment, and the second XPath (a format string that receives <paramref name="TableRowNum"/>
/// as <c>{0}</c>) is evaluated against that fragment.
/// Example: GetNodeFirstValue("http://www.cqcp.net/game/ssc/", "//*[@id=\"openlist\"]", "./ul[{0}]/li", Encoding.GetEncoding("gb2312"), 1);
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <param name="xPathFirst">XPath that selects the container node inside the downloaded page.</param>
/// <param name="xPathSecond">XPath format string evaluated inside the container; <c>{0}</c> is replaced by <paramref name="TableRowNum"/>.</param>
/// <param name="encoding">Encoding used to decode the downloaded bytes.</param>
/// <param name="TableRowNum">Row number substituted into <paramref name="xPathSecond"/>.</param>
/// <returns>
/// The inner HTML of each matched node, in document order. The list is empty when either XPath matches nothing.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="Url"/>, <paramref name="xPathFirst"/> or <paramref name="xPathSecond"/> is null.
/// </exception>
public static List<string> GetNodeFirstValue(string Url, string xPathFirst, string xPathSecond, Encoding encoding, int TableRowNum)
{
    if (Url == null)
    {
        throw new ArgumentNullException("Url");
    }
    if (xPathFirst == null)
    {
        throw new ArgumentNullException("xPathFirst");
    }
    if (xPathSecond == null)
    {
        throw new ArgumentNullException("xPathSecond");
    }

    List<string> ListData = new List<string>();
    using (WebClient client = new WebClient())
    using (MemoryStream ms = new MemoryStream(client.DownloadData(Url)))
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(ms, encoding);

        // SelectSingleNode yields null when nothing matches; treat that as "no data"
        // instead of failing with a NullReferenceException.
        HtmlNode container = doc.DocumentNode.SelectSingleNode(xPathFirst);
        if (container == null)
        {
            return ListData;
        }

        // Re-parse the container's content as its own document so that the second
        // XPath is evaluated relative to the fragment root, not the whole page.
        HtmlDocument docStockContext = new HtmlDocument();
        docStockContext.LoadHtml(container.InnerHtml);

        // SelectNodes yields null (not an empty collection) when nothing matches.
        HtmlNodeCollection nodeHeaders = docStockContext.DocumentNode.SelectNodes(string.Format(xPathSecond, TableRowNum));
        if (nodeHeaders == null)
        {
            return ListData;
        }

        foreach (HtmlNode nodeHeader in nodeHeaders)
        {
            ListData.Add(nodeHeader.InnerHtml);
        }
    }
    return ListData;
}