PdfExtractionStrategies 1.0.0

Provides functions that can extract elements, such as tables, from PDF files. The package is licensed under the AGPL, following iText 5.5.

There is a newer version of this package available.
See the version list below for details.
Install-Package PdfExtractionStrategies -Version 1.0.0
dotnet add package PdfExtractionStrategies --version 1.0.0
<PackageReference Include="PdfExtractionStrategies" Version="1.0.0" />
For projects that support PackageReference, copy this XML node into the project file to reference the package.
paket add PdfExtractionStrategies --version 1.0.0
The NuGet Team does not provide support for this client. Please contact its maintainers for support.

Sample Code:

  1. Extract table from PDF file to HTML:
        /// <summary>
        /// Extracts the tables from one page of a PDF file and writes each one,
        /// rendered as a simple HTML table, to the debug output.
        /// </summary>
        public void ExactTables()
        {
            using (var pdf = new PdfReader(@"c:\path to pdf with table"))
            {
                var tableStrategy = new TableExtractionStrategy();

                // The first argument (2) selects which page's content is processed.
                new PdfReaderContentParser(pdf).ProcessContent(2, tableStrategy);

                foreach (var extracted in tableStrategy.GetTables())
                {
                    Debug.WriteLine(GetSimpleHTMLTable(extracted));
                }
            }
        }

        /// <summary>
        /// Renders a table cell, and any nested cells, as a minimal HTML table.
        /// </summary>
        /// <param name="table">
        /// The cell to render. Its <c>Rows</c> and <c>Cols</c> define the grid, and
        /// <c>Children</c> is read in row-major order (index = row * Cols + column).
        /// </param>
        /// <returns>
        /// A <c>&lt;table&gt;</c> element with one <c>&lt;td&gt;</c> per grid position.
        /// Cells that have children are rendered recursively as nested tables.
        /// </returns>
        private string GetSimpleHTMLTable(PdfTableCell table)
        {
            var sb = new StringBuilder();

            sb.Append("<table>");
            for (int row = 0; row < table.Rows; row++)
            {
                sb.Append("<tr>");
                for (int col = 0; col < table.Cols; col++)
                {
                    sb.Append("<td>");
                    if (table.Children.Count == 0)
                    {
                        // No child cells: every grid position shows this cell's own text.
                        // The text comes from the PDF, so it is HTML-encoded to keep
                        // characters such as '<' and '&' from corrupting the markup.
                        sb.Append(System.Net.WebUtility.HtmlEncode(table.Text));
                    }
                    else
                    {
                        var cell = table.Children[row * table.Cols + col];

                        // Leaf text is encoded; a nested table is already markup and
                        // must be appended verbatim.
                        sb.Append(cell.Children.Count == 0
                            ? System.Net.WebUtility.HtmlEncode(cell.Text)
                            : GetSimpleHTMLTable(cell));
                    }

                    sb.Append("</td>");
                }
                sb.Append("</tr>");
            }
            sb.Append("</table>");

            return sb.ToString();
        }

Sample Code:

  1. Extract table from PDF file to HTML:
        /// <summary>
        /// Extracts the tables from one page of a PDF file and writes each one,
        /// rendered as a simple HTML table, to the debug output.
        /// </summary>
        public void ExactTables()
        {
            using (var pdf = new PdfReader(@"c:\path to pdf with table"))
            {
                var tableStrategy = new TableExtractionStrategy();

                // The first argument (2) selects which page's content is processed.
                new PdfReaderContentParser(pdf).ProcessContent(2, tableStrategy);

                foreach (var extracted in tableStrategy.GetTables())
                {
                    Debug.WriteLine(GetSimpleHTMLTable(extracted));
                }
            }
        }

        /// <summary>
        /// Renders a table cell, and any nested cells, as a minimal HTML table.
        /// </summary>
        /// <param name="table">
        /// The cell to render. Its <c>Rows</c> and <c>Cols</c> define the grid, and
        /// <c>Children</c> is read in row-major order (index = row * Cols + column).
        /// </param>
        /// <returns>
        /// A <c>&lt;table&gt;</c> element with one <c>&lt;td&gt;</c> per grid position.
        /// Cells that have children are rendered recursively as nested tables.
        /// </returns>
        private string GetSimpleHTMLTable(PdfTableCell table)
        {
            var sb = new StringBuilder();

            sb.Append("<table>");
            for (int row = 0; row < table.Rows; row++)
            {
                sb.Append("<tr>");
                for (int col = 0; col < table.Cols; col++)
                {
                    sb.Append("<td>");
                    if (table.Children.Count == 0)
                    {
                        // No child cells: every grid position shows this cell's own text.
                        // The text comes from the PDF, so it is HTML-encoded to keep
                        // characters such as '<' and '&' from corrupting the markup.
                        sb.Append(System.Net.WebUtility.HtmlEncode(table.Text));
                    }
                    else
                    {
                        var cell = table.Children[row * table.Cols + col];

                        // Leaf text is encoded; a nested table is already markup and
                        // must be appended verbatim.
                        sb.Append(cell.Children.Count == 0
                            ? System.Net.WebUtility.HtmlEncode(cell.Text)
                            : GetSimpleHTMLTable(cell));
                    }

                    sb.Append("</td>");
                }
                sb.Append("</tr>");
            }
            sb.Append("</table>");

            return sb.ToString();
        }

NuGet packages

This package is not used by any NuGet packages.

GitHub repositories

This package is not used by any popular GitHub repositories.

Version History

Version Downloads Last updated
1.0.2 797 1/6/2018
1.0.1 444 12/22/2017
1.0.0 454 12/18/2017