練習題 (11):The input is an HTML table; remove all tags and put the data in a comma/tab-separated file.
這一題使用 .NET Framework 中處理 Regular Expression 的 Regex 類別、Match 類別、Group 類別,來幫忙擷取 HTML 中的欄位資料。
以下是我輸入的 html 檔案內容:
-
<table width="100" border="1" >
-
<tr>
-
<td align="center" colspan="2" > >1.5 </td>
-
<td align="right" > 3 <</td>
-
</tr>
-
<tr>
-
<td align="left" > <4</td>
-
<td align="center" > 5> </td>
-
<td align="right" > <>6 </td>
-
</tr>
-
<tr>
-
<td align="left" > 7 <> </td>
-
<td colspan="2" align="center" > <8.5> </td>
-
</tr>
-
</table>
|
如下圖所示:
>1.5 |
3< |
<4 |
5> |
<>6 |
7<> |
<8.5> |
|
程式碼:
-
using System.IO;
-
using System.Text.RegularExpressions;
-
namespace TableParser
{
    /// <summary>
    /// Console utility that reads an HTML table from <c>in.html</c> and writes the
    /// text of its cells to <c>out.txt</c>, one table row per output line, with a
    /// tab character after every cell.
    /// </summary>
    internal class Program
    {
        // Matches, in document order, either one complete <td ...>...</td> cell
        // (inner text captured in group 1) or a closing </tr> tag. Both
        // alternatives are case-insensitive so that <TD> and </TR> are handled
        // the same way. Built once instead of once per input line.
        private static readonly Regex TokenPattern =
            new Regex(@"<td\b[^>]*>(.*?)</td>|</tr>", RegexOptions.IgnoreCase);

        /// <summary>
        /// Program entry point. Reads <c>in.html</c> line by line; each cell's inner
        /// text is written followed by a tab, and each closing <c>&lt;/tr&gt;</c>
        /// ends the current output line.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        /// <remarks>
        /// Cell text is written exactly as captured: it is not trimmed and HTML
        /// entities are not decoded. A cell must open and close on the same input
        /// line to be recognised, because the input is processed line by line.
        /// </remarks>
        private static void Main(string[] args)
        {
            // The using statements guarantee both files are closed (and the writer
            // flushed) even if reading or writing throws part-way through.
            using (StreamReader reader = new StreamReader(@"in.html"))
            using (StreamWriter writer = new StreamWriter("out.txt"))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Walk the tokens in the order they appear so that a line holding
                    // both cells and a row terminator (e.g. "<tr><td>1</td></tr>")
                    // keeps its cells instead of producing only a newline.
                    for (Match token = TokenPattern.Match(line); token.Success; token = token.NextMatch())
                    {
                        Group cell = token.Groups[1];
                        if (cell.Success)
                        {
                            // Group 1 participates only in the <td> alternative
                            // (an empty cell still counts as a successful capture).
                            writer.Write(cell.Value + "\t");
                        }
                        else
                        {
                            // Otherwise the token is a closing </tr>: end the row.
                            writer.WriteLine();
                        }
                    }
                }
            }
        }
    }
}
|