自動偵測網頁文件編碼(支援HTML5)

  • 3052
  • 0

自動偵測網頁文件編碼(支援HTML5)

若要在一般瀏覽器以外的場景正確顯示網頁文件的文字內容,電腦程式必須先取得該網頁文件的編碼名稱,才能據以處理。下方的程式碼改寫自《Feroze Daud's WebLog》的《Downloading content from the web using different encodings》[1]:將原程式碼自特定的使用平台(如控制台)抽離,移除經 .NET Framework 4.5 測試後發現多餘的程式碼,並增加對 HTML5 的支援。


        {
            // Downloads the document addressed by `url` (declared outside this snippet) and
            // returns the charset name declared in its markup, or string.Empty when no
            // "charset=" declaration followed by a closing '>' is found.
            //
            // Recognised declarations:
            //   HTML4: <meta http-equiv="content-type" content="text/html; charset=UTF-8">
            //   HTML5: <meta charset="UTF-8">
            //
            // Only the markup is inspected; the HTTP Content-Type header is not consulted.
            const string CharsetKey = "charset=";

            string result = string.Empty;

            WebRequest request = WebRequest.Create(url);

            // The using blocks guarantee the response and both streams are released even
            // when reading or parsing throws.
            using (WebResponse response = request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (MemoryStream rawdata = new MemoryStream())
            {
                // Buffer the whole response so it can be decoded in one pass.
                body.CopyTo(rawdata);
                rawdata.Seek(0, SeekOrigin.Begin);

                // The real encoding is not known yet; ASCII is sufficient because the
                // charset declaration itself consists of ASCII characters only.
                string markup;
                using (StreamReader reader = new StreamReader(rawdata, Encoding.ASCII))
                {
                    markup = reader.ReadToEnd();
                }

                // Attribute names are case-insensitive in HTML, so match "CHARSET=" as well.
                int keyIndex = markup.IndexOf(CharsetKey, StringComparison.OrdinalIgnoreCase);
                if (keyIndex >= 0)
                {
                    // "charset=" contains no '>', so tagEnd is never before valueStart.
                    int tagEnd = markup.IndexOf('>', keyIndex);
                    if (tagEnd >= 0)
                    {
                        int valueStart = keyIndex + CharsetKey.Length;

                        // Text between "charset=" and the closing '>' of the tag.
                        string value = markup.Substring(valueStart, tagEnd - valueStart);

                        // Skip any opening quote / whitespace in front of the name...
                        value = value.TrimStart(' ', '\t', '\r', '\n', '"', '\'');

                        // ...and stop at the first character that cannot belong to it, so
                        // trailing attributes or the "/" of a self-closing tag are not
                        // appended to the charset name.
                        int stop = value.IndexOfAny(new[] { '"', '\'', ' ', '\t', '\r', '\n', ';', '/' });
                        if (stop >= 0)
                        {
                            value = value.Substring(0, stop);
                        }

                        result = value;
                    }
                }
            }

            return result;
        }

參考資料來源:
[1]Downloading content from the web using different encodings
http://blogs.msdn.com/b/feroze_daud/archive/2004/03/30/104440.aspx