[AI] 使用Azure Document Intelligence將文件轉Markdown

  • 13
  • 0
  • AI
  • 2026-04-29

雲端版 Azure Document Intelligence 已能將文件轉成 Markdown 格式

只要採用 prebuilt-layout 模型即可

先切割 PDF 並另存圖片,需透過 NuGet 安裝 SkiaSharp、PdfSharpCore、PDFtoImage:

using PdfSharpCore.Pdf.IO;
using PDFtoImage;
using SkiaSharp;
using PdfSharpDocument = PdfSharpCore.Pdf.PdfDocument;

namespace Abbee
{
    /// <summary>
    /// One page extracted from a PDF.
    /// </summary>
    /// <param name="num">Page number assigned by the producer of this record.</param>
    /// <param name="content">Raw bytes carried for the page.</param>
    internal record SinglePage(
        int num,
        byte[] content);

    /// <summary>
    /// Splits a PDF into single-page PDFs and optionally renders each page to a PNG image.
    /// </summary>
    internal static class PdfTool
    {
        /// <summary>
        /// Lazily enumerates the pages of <paramref name="pdfPath"/> as single-page PDFs.
        /// </summary>
        /// <remarks>
        /// Output goes under a directory named after the input file without its extension:
        /// each page is saved as <c>pdf/NNN.pdf</c> and, when <paramref name="SaveImg"/> is true,
        /// rendered to <c>ocr/NNN.png</c> (skipped when that PNG already exists).
        /// Because this is an iterator, nothing happens until the result is enumerated.
        /// </remarks>
        /// <param name="pdfPath">Path of the source PDF.</param>
        /// <param name="SaveImg">When true, also render every page to a 300 DPI PNG.</param>
        /// <returns>One <see cref="SinglePage"/> per page, numbered starting at 1.</returns>
        public static IEnumerable<SinglePage> GetPdfPages(string pdfPath, bool SaveImg = false)
        {
            using var inputDocument = PdfReader.Open(pdfPath, PdfDocumentOpenMode.Import);
            Console.WriteLine($"開始處理 PDF: {pdfPath}");

            // Read the total page count of the source PDF.
            int totalPages = inputDocument.PageCount;
            Console.WriteLine($"PDF 共 {totalPages} 頁");

            // Strip only the file-name extension. The previous Substring/LastIndexOf('.') approach
            // threw when the path had no dot and cut at the wrong place when only a directory
            // name contained one.
            var outputRoot = Path.ChangeExtension(pdfPath, null);
            var options = new RenderOptions { Dpi = 300 };
            var pdfDir = Path.Combine(outputRoot, "pdf");
            Directory.CreateDirectory(pdfDir);
            var imgDir = Path.Combine(outputRoot, "ocr");
            Directory.CreateDirectory(imgDir);

            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++)
            {
                // Files and returned records use 1-based page numbers.
                int pageNumber = pageIndex + 1;
                var page = GetSinglePagePdf(inputDocument, pageIndex);

                // Save the page into the "pdf" directory. The original code wrote to pdfPath,
                // i.e. over the input file, and never awaited the write.
                File.WriteAllBytes(Path.Combine(pdfDir, $"{pageNumber:000}.pdf"), page);

                if (SaveImg)
                {
                    var imgFile = Path.Combine(imgDir, $"{pageNumber:000}.png");
                    if (!File.Exists(imgFile))
                    {
                        // Dispose the bitmap and encoded data promptly; they hold native memory.
                        using var bitmap = Conversion.ToImage(page, 0, null, options);
                        using var imageData = bitmap.Encode(SKEncodedImageFormat.Png, 100);
                        using var fs = File.Create(imgFile);
                        imageData.SaveTo(fs);
                    }
                }

                yield return new SinglePage(pageNumber, page);
            }
        }

        /// <summary>
        /// Builds a new PDF containing only the page at <paramref name="pageIndex"/>.
        /// </summary>
        /// <param name="inputDocument">Source document to copy the page from.</param>
        /// <param name="pageIndex">Zero-based index of the page to extract.</param>
        /// <returns>The serialized single-page PDF.</returns>
        private static byte[] GetSinglePagePdf(PdfSharpDocument inputDocument, int pageIndex)
        {
            using var singlePageDoc = new PdfSharpDocument();
            singlePageDoc.AddPage(inputDocument.Pages[pageIndex]);
            using var ms = new MemoryStream();
            // false = leave the stream open so its contents can be copied out below.
            singlePageDoc.Save(ms, false);
            return ms.ToArray();
        }
    }
}

以下是呼叫上述切割 PDF 的方法,將每頁轉成 Markdown 檔案(.md)的範例:

using Azure;
using Azure.AI.DocumentIntelligence;
using System.Text;

namespace Abbee
{
    /// <summary>
    /// Converts each page of a PDF to Markdown using the Azure Document Intelligence
    /// "prebuilt-layout" model.
    /// </summary>
    internal class OCR
    {
        /// <summary>
        /// Analyzes every page of <paramref name="pdfPath"/> and writes one Markdown file per page.
        /// </summary>
        /// <remarks>
        /// Pages whose output file already exists are skipped. If a page still fails after all
        /// attempts, the error is printed and processing stops; the exception is not rethrown.
        /// </remarks>
        /// <param name="pdfPath">Path of the source PDF.</param>
        /// <param name="output">
        /// Composite format string for the output file path; <c>{0}</c> receives the page number.
        /// </param>
        /// <param name="tryTimes">Maximum number of attempts per page.</param>
        /// <param name="endpoint">Document Intelligence endpoint URL.</param>
        /// <param name="key">API key for the endpoint.</param>
        /// <param name="SaveJpg">Passed to <c>PdfTool.GetPdfPages</c> to also save page images.</param>
        public static async Task GetMarkdown(string pdfPath, string output, int tryTimes, string endpoint, string key, bool SaveJpg)
        {
            var utf8Bom = new UTF8Encoding(encoderShouldEmitUTF8Identifier: true);

            // Created on first use and then reused for all pages and retries. Construction stays
            // inside the per-page try so an invalid endpoint is still reported like any other failure.
            DocumentIntelligenceClient? client = null;

            // Process each page of the PDF.
            foreach (var singlePagePdf in PdfTool.GetPdfPages(pdfPath, SaveJpg))
            {
                int pageNumber = singlePagePdf.num;
                Console.WriteLine($"正在處理第 {pageNumber} 頁...");
                try
                {
                    var txtPath = string.Format(output, pageNumber);

                    // GetDirectoryName yields null/empty when the path has no directory part;
                    // Directory.CreateDirectory would throw on that, so only create when present.
                    var outputDir = System.IO.Path.GetDirectoryName(txtPath);
                    if (!string.IsNullOrEmpty(outputDir))
                    {
                        Directory.CreateDirectory(outputDir);
                    }

                    if (File.Exists(txtPath))
                    {
                        continue;
                    }

                    // Call OCR, retrying up to tryTimes times.
                    for (int attemptsLeft = tryTimes; attemptsLeft > 0; attemptsLeft--)
                    {
                        try
                        {
                            client ??= new DocumentIntelligenceClient(new Uri(endpoint), new AzureKeyCredential(key));
                            var options = new AnalyzeDocumentOptions("prebuilt-layout", new BinaryData(singlePagePdf.content))
                            {
                                OutputContentFormat = DocumentContentFormat.Markdown
                            };
                            // Optional: options.Features.Add(DocumentAnalysisFeature.OcrHighResolution)
                            // enables high-resolution OCR at the cost of longer processing time.
                            Operation<AnalyzeResult> operation = await client.AnalyzeDocumentAsync(WaitUntil.Completed, options);
                            var markdown = operation.Value.Content.Trim();

                            // Await the write so failures are observed (and retried) and the file
                            // is complete before moving on.
                            await File.WriteAllTextAsync(txtPath, markdown, utf8Bom);
                            break;
                        }
                        catch (Exception ex) when (attemptsLeft > 1)
                        {
                            // Not the last attempt: report and retry. The final failure propagates
                            // to the outer catch below.
                            Console.WriteLine($"  第 {pageNumber} 頁 OCR 失敗,重試中(剩餘 {attemptsLeft - 1} 次): {ex.Message}");
                        }
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine($"  第 {pageNumber} 頁 OCR 存檔失敗: {ex}");
                    return;
                }
            }
        }
    }
}

 

Taiwan is a country. 臺灣是我的國家