雲端版 Azure Document Intelligence 已能將文件轉為 Markdown 格式,
只要採用 prebuilt-layout 模型即可。
先切割 PDF 並另存圖片;需先以 NuGet 安裝 SkiaSharp、PdfSharpCore、PDFtoImage:
using PdfSharpCore.Pdf.IO;
using PDFtoImage;
using SkiaSharp;
using PdfSharpDocument = PdfSharpCore.Pdf.PdfDocument;
namespace Abbee
{
/// <summary>
/// One page extracted from a source PDF.
/// </summary>
/// <param name="num">Page number of this page within the source PDF; PdfTool.GetPdfPages supplies 1-based values.</param>
/// <param name="content">Bytes of a standalone PDF document that contains only this page.</param>
internal record SinglePage(int num, byte[] content);
/// <summary>
/// Helpers for splitting a PDF into single-page PDF documents and, optionally,
/// rendering each page to a PNG image.
/// </summary>
internal static class PdfTool
{
    /// <summary>
    /// Lazily splits the PDF at <paramref name="pdfPath"/> into single-page PDFs.
    /// </summary>
    /// <remarks>
    /// Two folders are created next to the source file, under a directory named after the
    /// PDF without its extension: <c>pdf</c> receives one <c>NNN.pdf</c> per page and
    /// <c>ocr</c> receives one <c>NNN.png</c> per page when <paramref name="SaveImg"/> is true.
    /// Because this is an iterator, nothing happens until the result is enumerated, and each
    /// page is written to disk just before it is yielded.
    /// </remarks>
    /// <param name="pdfPath">Path of the source PDF. The source file itself is never modified.</param>
    /// <param name="SaveImg">When true, also renders every page to a 300 DPI PNG (existing PNG files are kept).</param>
    /// <returns>One <see cref="SinglePage"/> per page, carrying the 1-based page number and the page's PDF bytes.</returns>
    public static IEnumerable<SinglePage> GetPdfPages(string pdfPath, bool SaveImg = false)
    {
        using var inputDocument = PdfReader.Open(pdfPath, PdfDocumentOpenMode.Import);
        Console.WriteLine($"開始處理 PDF: {pdfPath}");

        int totalPages = inputDocument.PageCount;
        Console.WriteLine($"PDF 共 {totalPages} 頁");

        // Strip only the file name's extension. Unlike Substring/LastIndexOf('.'), this does not
        // throw for extension-less paths and is not confused by dots in directory names.
        var basePath = Path.ChangeExtension(pdfPath, null);
        var pdfDir = Path.Combine(basePath, "pdf");
        Directory.CreateDirectory(pdfDir);
        var imgDir = Path.Combine(basePath, "ocr");
        Directory.CreateDirectory(imgDir);

        var options = new RenderOptions { Dpi = 300 };

        for (int pageIndex = 0; pageIndex < totalPages; pageIndex++)
        {
            // PdfSharp indexes pages from 0; file names and SinglePage.num are 1-based.
            int pageNumber = pageIndex + 1;
            var page = GetSinglePagePdf(inputDocument, pageIndex);

            // Write into the "pdf" folder. Writing to pdfPath would overwrite the source
            // document with a single page, and an un-awaited async write could be lost.
            File.WriteAllBytes(Path.Combine(pdfDir, $"{pageNumber:000}.pdf"), page);

            if (SaveImg)
            {
                var imgFile = Path.Combine(imgDir, $"{pageNumber:000}.png");
                if (!File.Exists(imgFile))
                {
                    // The rendered bitmap wraps native memory, so dispose it promptly.
                    using var img = Conversion.ToImage(page, 0, null, options);
                    using var imageData = img.Encode(SKEncodedImageFormat.Png, 100);
                    using var fs = File.Create(imgFile);
                    imageData.SaveTo(fs);
                }
            }

            yield return new SinglePage(pageNumber, page);
        }
    }

    /// <summary>
    /// Copies one page of <paramref name="inputDocument"/> into a new PDF document and
    /// returns that document serialized to a byte array.
    /// </summary>
    /// <param name="inputDocument">Source document, opened in import mode by the caller.</param>
    /// <param name="pageIndex">Zero-based index of the page to extract.</param>
    /// <returns>Bytes of a PDF containing only the requested page.</returns>
    private static byte[] GetSinglePagePdf(PdfSharpDocument inputDocument, int pageIndex)
    {
        using var singlePageDoc = new PdfSharpDocument();
        singlePageDoc.AddPage(inputDocument.Pages[pageIndex]);

        using var ms = new MemoryStream();
        // closeStream: false keeps the MemoryStream open so ToArray() can read it.
        singlePageDoc.Save(ms, false);
        return ms.ToArray();
    }
}
}
以下範例呼叫上述切割 PDF 的方法,將每頁轉成 Markdown 檔案 (.md):
using Azure;
using Azure.AI.DocumentIntelligence;
using System.Text;
namespace Abbee
{
/// <summary>
/// Converts each page of a PDF to Markdown using the Azure Document Intelligence
/// "prebuilt-layout" model.
/// </summary>
internal class OCR
{
    /// <summary>
    /// Splits <paramref name="pdfPath"/> into pages via <c>PdfTool.GetPdfPages</c>, sends each
    /// page to Azure Document Intelligence, and writes the Markdown result to one file per page
    /// (UTF-8 with BOM).
    /// </summary>
    /// <remarks>
    /// Pages whose output file already exists are skipped, so an interrupted run can be resumed.
    /// If a page still fails after all attempts, the error is logged and processing stops; the
    /// remaining pages are not processed.
    /// </remarks>
    /// <param name="pdfPath">Path of the source PDF.</param>
    /// <param name="output">Composite format string for the output path; <c>{0}</c> receives the 1-based page number.</param>
    /// <param name="tryTimes">Maximum number of OCR attempts per page. Values below 1 result in no OCR call and no output file.</param>
    /// <param name="endpoint">Azure Document Intelligence endpoint URL.</param>
    /// <param name="key">Azure Document Intelligence API key.</param>
    /// <param name="SaveJpg">Passed to <c>PdfTool.GetPdfPages</c> as its <c>SaveImg</c> flag (also save a rendered image of each page).</param>
    public static async Task GetMarkdown(string pdfPath, string output, int tryTimes, string endpoint, string key, bool SaveJpg)
    {
        #region Process each PDF page
        var utf8Bom = new UTF8Encoding(encoderShouldEmitUTF8Identifier: true);

        // Created lazily, once, on the first page that actually needs OCR. A bad endpoint is then
        // still reported through the per-page catch below, and no client is built when every
        // page is already cached on disk.
        DocumentIntelligenceClient? client = null;

        foreach (var singlePagePdf in PdfTool.GetPdfPages(pdfPath, SaveJpg))
        {
            int pageNumber = singlePagePdf.num;
            Console.WriteLine($"正在處理第 {pageNumber} 頁...");
            try
            {
                var txtPath = string.Format(output, pageNumber);

                // GetDirectoryName returns an empty string for a bare file name, and
                // Directory.CreateDirectory("") throws, so only create when there is a directory part.
                var outputDir = System.IO.Path.GetDirectoryName(txtPath);
                if (!string.IsNullOrEmpty(outputDir))
                {
                    Directory.CreateDirectory(outputDir);
                }

                if (File.Exists(txtPath))
                {
                    continue;
                }

                client ??= new DocumentIntelligenceClient(new Uri(endpoint), new AzureKeyCredential(key));

                string? markdown = await AnalyzePageAsync(client, singlePagePdf.content, tryTimes, pageNumber);
                if (markdown is not null)
                {
                    // Awaited so that write failures are observed and the file is complete
                    // before moving on to the next page.
                    await File.WriteAllTextAsync(txtPath, markdown, utf8Bom);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($" 第 {pageNumber} 頁 OCR 存檔失敗: {ex}");
                return;
            }
        }
        #endregion
    }

    /// <summary>
    /// Runs the "prebuilt-layout" analysis on one single-page PDF, retrying on failure.
    /// </summary>
    /// <param name="client">Client used to call the service.</param>
    /// <param name="pagePdf">Bytes of the single-page PDF to analyze.</param>
    /// <param name="tryTimes">Maximum number of attempts; the exception from the final attempt propagates to the caller.</param>
    /// <param name="pageNumber">1-based page number, used only for log messages.</param>
    /// <returns>The trimmed Markdown content, or <c>null</c> when <paramref name="tryTimes"/> is below 1.</returns>
    private static async Task<string?> AnalyzePageAsync(DocumentIntelligenceClient client, byte[] pagePdf, int tryTimes, int pageNumber)
    {
        for (int remaining = tryTimes; remaining > 0; remaining--)
        {
            try
            {
                var options = new AnalyzeDocumentOptions("prebuilt-layout", new BinaryData(pagePdf))
                {
                    OutputContentFormat = DocumentContentFormat.Markdown
                };
                // For high-resolution OCR (slower), add DocumentAnalysisFeature.OcrHighResolution
                // to options.Features before sending the request.
                Operation<AnalyzeResult> operation = await client.AnalyzeDocumentAsync(WaitUntil.Completed, options);
                return operation.Value.Content.Trim();
            }
            catch (Exception ex) when (remaining > 1)
            {
                // Not the last attempt: report the failure instead of swallowing it, then retry.
                Console.WriteLine($" 第 {pageNumber} 頁 OCR 失敗,重試中 (剩餘 {remaining - 1} 次): {ex.Message}");
            }
        }

        return null;
    }
}
}
Taiwan is a country. 臺灣是我的國家