且构网

分享程序员开发的那些事...
且构网 - 分享程序员编程开发的那些事

如何在C#中从PDF中提取格式化文本

更新时间:2023-02-12 21:58:32

/// <summary>
/// Extracts the text of every page of a PDF file, concatenates it in page
/// order, shows it in <c>txtInput</c> and returns it.
/// </summary>
/// <param name="path">Path of the PDF file handed to <c>PdfReader</c>.</param>
/// <returns>
/// The text of all pages joined together with no extra separator between
/// pages; an empty string when the document yields no text.
/// </returns>
/// <remarks>
/// Side effect: assigns the extracted text to <c>txtInput.Text</c>.
/// <c>txtInput</c> is declared outside this method (presumably a text box on
/// the hosting form - confirm against the enclosing class).
/// Exceptions thrown by <c>PdfReader</c> or <c>PdfTextExtractor</c> are not
/// caught here; the reader is still closed before they propagate.
/// </remarks>
public string ReadPdfFile(string path)
        {
            // Accumulate in a StringBuilder: repeated string concatenation
            // re-copies all previously extracted text on every page.
            StringBuilder text = new StringBuilder();

            PdfReader pdfReader = new PdfReader(path);
            try
            {
                // PDF page numbers are 1-based.
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    // A fresh strategy per page, as in the original code.
                    // NOTE(review): the strategy appears to buffer the text it
                    // has seen, so sharing one instance across pages would
                    // likely duplicate earlier pages - confirm before reusing.
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }
            finally
            {
                // Release the underlying file even when extraction fails midway.
                pdfReader.Close();
            }

            string result = text.ToString();
            txtInput.Text = result;
            return result;
        }