Commit f641d33a by mahaisong

feat:新的pdftojson

parent b2cdc14f
......@@ -155,7 +155,7 @@
<site name="PdfToJson" id="2">
<application path="/" applicationPool="Clr4IntegratedAppPool">
<virtualDirectory path="/" physicalPath="E:\DoSomeThing\smallproject\30.PdfToJson\PdfToJson" />
<virtualDirectory path="/" physicalPath="D:\smallproject\30.PdfToJson\PdfToJson" />
</application>
<bindings>
<binding protocol="http" bindingInformation="*:8001:192.168.2.29" />
......
......@@ -7,17 +7,17 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PdfToJson", "PdfToJson\PdfT
EndProject
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "PocTxt_PY", "PocTxt_PY\PocTxt_PY.pyproj", "{CC6D3D30-970F-4F6E-AD55-6571D7BC9B09}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HTCommon", "..\..\..\codes\ht-intelligent-information\src\HTCommon\HTCommon.csproj", "{E497051E-61E1-4247-93D3-7929A8C8B7F6}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PdfToJsonModel", "PdfToJsonModel\PdfToJsonModel.csproj", "{ECB8F3D4-6C4C-4A72-BC8E-499F94562048}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Palas.Protocol", "..\..\..\codes\Palas\src\Palas.Protocol\Palas.Protocol.csproj", "{2CE2B29E-933D-4DC4-A37A-E3A874F76BCA}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "EFCoreInitConfig", "..\..\worksapce\Palas\src\EFCoreModels\EFCoreInitConfig\EFCoreInitConfig.csproj", "{28AB438D-DF4A-4C79-81B7-DCBBDC7765E7}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "GILogger", "..\..\..\codes\Palas\src\GILogger\GILogger.csproj", "{3E27E74E-E3B6-4D8C-827F-868A0DCF1D87}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "GILogger", "..\..\worksapce\Palas\src\GILogger\GILogger.csproj", "{0B6C970E-90B1-4D38-A3D3-87594D0470CE}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "EFCoreInitConfig", "..\..\..\codes\Palas\src\EFCoreModels\EFCoreInitConfig\EFCoreInitConfig.csproj", "{D080B0A3-8B2B-4564-A568-24AA27BA0C9D}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Palas.Protocol", "..\..\worksapce\Palas\src\Palas.Protocol\Palas.Protocol.csproj", "{DABEBB25-242F-4063-A285-5D65DF9D50B4}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PdfToJsonModel", "PdfToJsonModel\PdfToJsonModel.csproj", "{ECB8F3D4-6C4C-4A72-BC8E-499F94562048}"
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HTCommon", "..\..\worksapce\ht-intelligent-information\src\HTCommon\HTCommon.csproj", "{E497051E-61E1-4247-93D3-7929A8C8B7F6}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Thrinax.Parser.PDFParser", "..\..\..\codes\thrinax\src\Thrinax.Parser.PDFParser\Thrinax.Parser.PDFParser.csproj", "{579A06A4-8B76-4407-B47A-2E29EFFC43F9}"
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Thrinax.Parser.PDFParser", "Thrinax.Parser.PDFParser\Thrinax.Parser.PDFParser.csproj", "{579A06A4-8B76-4407-B47A-2E29EFFC43F9}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
......@@ -31,26 +31,26 @@ Global
{20D86D46-C3A0-45D2-AD09-33CC6DF3F4E2}.Release|Any CPU.Build.0 = Release|Any CPU
{CC6D3D30-970F-4F6E-AD55-6571D7BC9B09}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{CC6D3D30-970F-4F6E-AD55-6571D7BC9B09}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E497051E-61E1-4247-93D3-7929A8C8B7F6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E497051E-61E1-4247-93D3-7929A8C8B7F6}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E497051E-61E1-4247-93D3-7929A8C8B7F6}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E497051E-61E1-4247-93D3-7929A8C8B7F6}.Release|Any CPU.Build.0 = Release|Any CPU
{2CE2B29E-933D-4DC4-A37A-E3A874F76BCA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{2CE2B29E-933D-4DC4-A37A-E3A874F76BCA}.Debug|Any CPU.Build.0 = Debug|Any CPU
{2CE2B29E-933D-4DC4-A37A-E3A874F76BCA}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2CE2B29E-933D-4DC4-A37A-E3A874F76BCA}.Release|Any CPU.Build.0 = Release|Any CPU
{3E27E74E-E3B6-4D8C-827F-868A0DCF1D87}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3E27E74E-E3B6-4D8C-827F-868A0DCF1D87}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3E27E74E-E3B6-4D8C-827F-868A0DCF1D87}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3E27E74E-E3B6-4D8C-827F-868A0DCF1D87}.Release|Any CPU.Build.0 = Release|Any CPU
{D080B0A3-8B2B-4564-A568-24AA27BA0C9D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D080B0A3-8B2B-4564-A568-24AA27BA0C9D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D080B0A3-8B2B-4564-A568-24AA27BA0C9D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D080B0A3-8B2B-4564-A568-24AA27BA0C9D}.Release|Any CPU.Build.0 = Release|Any CPU
{ECB8F3D4-6C4C-4A72-BC8E-499F94562048}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{ECB8F3D4-6C4C-4A72-BC8E-499F94562048}.Debug|Any CPU.Build.0 = Debug|Any CPU
{ECB8F3D4-6C4C-4A72-BC8E-499F94562048}.Release|Any CPU.ActiveCfg = Release|Any CPU
{ECB8F3D4-6C4C-4A72-BC8E-499F94562048}.Release|Any CPU.Build.0 = Release|Any CPU
{28AB438D-DF4A-4C79-81B7-DCBBDC7765E7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{28AB438D-DF4A-4C79-81B7-DCBBDC7765E7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{28AB438D-DF4A-4C79-81B7-DCBBDC7765E7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{28AB438D-DF4A-4C79-81B7-DCBBDC7765E7}.Release|Any CPU.Build.0 = Release|Any CPU
{0B6C970E-90B1-4D38-A3D3-87594D0470CE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{0B6C970E-90B1-4D38-A3D3-87594D0470CE}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0B6C970E-90B1-4D38-A3D3-87594D0470CE}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0B6C970E-90B1-4D38-A3D3-87594D0470CE}.Release|Any CPU.Build.0 = Release|Any CPU
{DABEBB25-242F-4063-A285-5D65DF9D50B4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{DABEBB25-242F-4063-A285-5D65DF9D50B4}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DABEBB25-242F-4063-A285-5D65DF9D50B4}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DABEBB25-242F-4063-A285-5D65DF9D50B4}.Release|Any CPU.Build.0 = Release|Any CPU
{E497051E-61E1-4247-93D3-7929A8C8B7F6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E497051E-61E1-4247-93D3-7929A8C8B7F6}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E497051E-61E1-4247-93D3-7929A8C8B7F6}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E497051E-61E1-4247-93D3-7929A8C8B7F6}.Release|Any CPU.Build.0 = Release|Any CPU
{579A06A4-8B76-4407-B47A-2E29EFFC43F9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{579A06A4-8B76-4407-B47A-2E29EFFC43F9}.Debug|Any CPU.Build.0 = Debug|Any CPU
{579A06A4-8B76-4407-B47A-2E29EFFC43F9}.Release|Any CPU.ActiveCfg = Release|Any CPU
......
......@@ -45,6 +45,12 @@
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="AngleSharp, Version=0.9.9.0, Culture=neutral, PublicKeyToken=e83494dcdc6d31ea, processorArchitecture=MSIL">
<HintPath>..\packages\AngleSharp.0.9.10\lib\net45\AngleSharp.dll</HintPath>
</Reference>
<Reference Include="HtmlAgilityPack, Version=1.8.5.0, Culture=neutral, PublicKeyToken=bd319b19eaf3b43a, processorArchitecture=MSIL">
<HintPath>..\packages\HtmlAgilityPack.1.8.5\lib\Net45\HtmlAgilityPack.dll</HintPath>
</Reference>
<Reference Include="log4net, Version=2.0.8.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
<HintPath>..\packages\log4net.2.0.8\lib\net45-full\log4net.dll</HintPath>
</Reference>
......@@ -200,6 +206,9 @@
<Reference Include="tabula">
<HintPath>..\libs\tabula.dll</HintPath>
</Reference>
<Reference Include="WebDriver, Version=3.14.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\packages\Selenium.WebDriver.3.14.0\lib\net45\WebDriver.dll</HintPath>
</Reference>
<Reference Include="WebGrease">
<Private>True</Private>
<HintPath>..\packages\WebGrease.1.6.0\lib\WebGrease.dll</HintPath>
......@@ -266,9 +275,6 @@
<Content Include="Content\bootstrap.css" />
<Content Include="Content\bootstrap.min.css" />
<Content Include="favicon.ico" />
<Content Include="FileData\文件存储目录.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<Content Include="fonts\glyphicons-halflings-regular.svg" />
<Content Include="Global.asax" />
<Content Include="Scripts\bootstrap.js" />
......@@ -319,6 +325,7 @@
</ItemGroup>
<ItemGroup>
<Folder Include="App_Data\" />
<Folder Include="FileData\" />
<Folder Include="Views\PdfToJson\" />
</ItemGroup>
<ItemGroup>
......@@ -351,30 +358,30 @@
<Content Include="Scripts\jquery-3.3.1.min.map" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\..\codes\ht-intelligent-information\src\HTCommon\HTCommon.csproj">
<Project>{E497051E-61E1-4247-93D3-7929A8C8B7F6}</Project>
<ProjectReference Include="..\..\..\worksapce\ht-intelligent-information\src\HTCommon\HTCommon.csproj">
<Project>{e497051e-61e1-4247-93d3-7929a8c8b7f6}</Project>
<Name>HTCommon</Name>
</ProjectReference>
<ProjectReference Include="..\..\..\..\codes\Palas\src\EFCoreModels\EFCoreInitConfig\EFCoreInitConfig.csproj">
<Project>{d080b0a3-8b2b-4564-a568-24aa27ba0c9d}</Project>
<ProjectReference Include="..\..\..\worksapce\Palas\src\EFCoreModels\EFCoreInitConfig\EFCoreInitConfig.csproj">
<Project>{28ab438d-df4a-4c79-81b7-dcbbdc7765e7}</Project>
<Name>EFCoreInitConfig</Name>
</ProjectReference>
<ProjectReference Include="..\..\..\..\codes\Palas\src\GILogger\GILogger.csproj">
<Project>{3e27e74e-e3b6-4d8c-827f-868a0dcf1d87}</Project>
<ProjectReference Include="..\..\..\worksapce\Palas\src\GILogger\GILogger.csproj">
<Project>{0b6c970e-90b1-4d38-a3d3-87594d0470ce}</Project>
<Name>GILogger</Name>
</ProjectReference>
<ProjectReference Include="..\..\..\..\codes\Palas\src\Palas.Protocol\Palas.Protocol.csproj">
<Project>{2ce2b29e-933d-4dc4-a37a-e3a874f76bca}</Project>
<ProjectReference Include="..\..\..\worksapce\Palas\src\Palas.Protocol\Palas.Protocol.csproj">
<Project>{dabebb25-242f-4063-a285-5d65df9d50b4}</Project>
<Name>Palas.Protocol</Name>
</ProjectReference>
<ProjectReference Include="..\..\..\..\codes\thrinax\src\Thrinax.Parser.PDFParser\Thrinax.Parser.PDFParser.csproj">
<Project>{579a06a4-8b76-4407-b47a-2e29effc43f9}</Project>
<Name>Thrinax.Parser.PDFParser</Name>
</ProjectReference>
<ProjectReference Include="..\PdfToJsonModel\PdfToJsonModel.csproj">
<Project>{ecb8f3d4-6c4c-4a72-bc8e-499f94562048}</Project>
<Name>PdfToJsonModel</Name>
</ProjectReference>
<ProjectReference Include="..\Thrinax.Parser.PDFParser\Thrinax.Parser.PDFParser.csproj">
<Project>{579a06a4-8b76-4407-b47a-2e29effc43f9}</Project>
<Name>Thrinax.Parser.PDFParser</Name>
</ProjectReference>
</ItemGroup>
<PropertyGroup>
<VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">10.0</VisualStudioVersion>
......@@ -409,7 +416,9 @@
<ErrorText>这台计算机上缺少此项目引用的 NuGet 程序包。使用“NuGet 程序包还原”可下载这些程序包。有关更多信息,请参见 http://go.microsoft.com/fwlink/?LinkID=322105。缺少的文件是 {0}。</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.2.0.0\build\net46\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.props')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.2.0.0\build\net46\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.props'))" />
<Error Condition="!Exists('..\packages\Selenium.WebDriver.ChromeDriver.2.41.0\build\Selenium.WebDriver.ChromeDriver.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Selenium.WebDriver.ChromeDriver.2.41.0\build\Selenium.WebDriver.ChromeDriver.targets'))" />
</Target>
<Import Project="..\packages\Selenium.WebDriver.ChromeDriver.2.41.0\build\Selenium.WebDriver.ChromeDriver.targets" Condition="Exists('..\packages\Selenium.WebDriver.ChromeDriver.2.41.0\build\Selenium.WebDriver.ChromeDriver.targets')" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
......
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="AngleSharp" version="0.9.10" targetFramework="net461" />
<package id="Antlr" version="3.5.0.2" targetFramework="net461" />
<package id="bootstrap" version="3.3.7" targetFramework="net461" />
<package id="HtmlAgilityPack" version="1.8.5" targetFramework="net461" />
<package id="jQuery" version="3.3.1" targetFramework="net461" />
<package id="log4net" version="2.0.8" targetFramework="net461" />
<package id="Microsoft.AspNet.Mvc" version="5.2.4" targetFramework="net461" />
......@@ -45,6 +47,8 @@
<package id="Pomelo.EntityFrameworkCore.MySql" version="2.1.2" targetFramework="net461" />
<package id="Pomelo.JsonObject" version="2.0.0" targetFramework="net461" />
<package id="Remotion.Linq" version="2.2.0" targetFramework="net461" />
<package id="Selenium.WebDriver" version="3.14.0" targetFramework="net461" />
<package id="Selenium.WebDriver.ChromeDriver" version="2.41.0" targetFramework="net461" />
<package id="System.Buffers" version="4.4.0" targetFramework="net461" />
<package id="System.Collections.Immutable" version="1.5.0" targetFramework="net461" />
<package id="System.ComponentModel.Annotations" version="4.5.0" targetFramework="net461" />
......
......@@ -10,7 +10,10 @@
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\..\codes\Palas\src\EFCoreModels\EFCoreInitConfig\EFCoreInitConfig.csproj" />
<ProjectReference Include="..\..\..\worksapce\ht-intelligent-information\src\HTCommon\HTCommon.csproj" />
<ProjectReference Include="..\..\..\worksapce\Palas\src\EFCoreModels\EFCoreInitConfig\EFCoreInitConfig.csproj" />
<ProjectReference Include="..\..\..\worksapce\Palas\src\GILogger\GILogger.csproj" />
<ProjectReference Include="..\..\..\worksapce\Palas\src\Palas.Protocol\Palas.Protocol.csproj" />
</ItemGroup>
</Project>
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Thrinax.Parser.PDFParser
{
/// <summary>
/// Bookkeeping record for one distinct line of text that is a candidate for removal
/// (for example a repeated page header or footer).
/// </summary>
public class ContentRemoveTag
{
    /// <summary>The candidate line text, as stored by the code that created the record.</summary>
    public string Content { get; set; }

    /// <summary>How many times this text has been recorded.</summary>
    public int OccurCount { get; set; }

    /// <summary>Every page/line position at which this text was recorded.</summary>
    public List<TagPosition> tagPositions { get; set; }
}
/// <summary>
/// Identifies a single line of a document by page number and line number.
/// </summary>
public class TagPosition
{
    /// <summary>Page the line belongs to.</summary>
    public int PageNumber { get; set; }

    /// <summary>Line number within that page.</summary>
    public int LineNumber { get; set; }
}
}
using java.io;
using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.text;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using technology.tabula;
using technology.tabula.extractors;
using technology.tabula.json;
namespace Thrinax.Parser.PDFParser
{
/// <summary>
/// Pdf文件内容解析
/// </summary>
public class PDFParser
{
// Marker handed to PDFTextStripper.setPageEnd so the extracted text can be split back into pages.
private const string pageEndMark = "--GIMindPageEnd--";
// The markers below are not referenced by any active code in this class
// (paragraph/table markers appear only in commented-out lines).
private const string paragraphEndMark = "--GIMindParagraphEnd--";
private const string tableStartMark = "--GIMindTableStart--";
private const string tableEndMark = "--GIMindTableEnd--";
private const string structStartMark = "--GIMindStructStart--";
private const string structEndMark = "--GIMindStructEnd--";
// Splits page text into lines on CR LF. Read-only so the shared instance cannot be swapped at runtime.
private static readonly Regex lineRegex = new Regex("\r\n", RegexOptions.Compiled);
/// <summary>
/// Parses a PDF identified by its file path.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file.</param>
/// <param name="tableContainType">Format used for tables embedded in the resulting text.</param>
/// <returns>
/// The parsed model, or <c>null</c> when the file does not exist or loading/parsing throws.
/// </returns>
public static PDFModel Parser(string pdfFileName, TableContainType tableContainType)
{
    if (!System.IO.File.Exists(pdfFileName))
    {
        return null;
    }

    PDDocument reader = null;
    try
    {
        // Open the document and delegate to the PDDocument overload.
        reader = PDDocument.load(new java.io.File(pdfFileName));
        return Parser(reader, tableContainType);
    }
    catch (Exception)
    {
        // Best effort by design: any failure is reported to the caller as null.
        return null;
    }
    finally
    {
        // Single cleanup point; runs on both the success and the failure path.
        if (reader != null)
        {
            reader.close();
        }
    }
}
/// <summary>
/// Parses a PDF supplied as an in-memory byte array.
/// </summary>
/// <param name="pdfStream">Raw bytes of the PDF.</param>
/// <param name="tableContainType">Format used for tables embedded in the resulting text.</param>
/// <returns>
/// The parsed model, or <c>null</c> when <paramref name="pdfStream"/> is null or loading/parsing throws.
/// </returns>
public static PDFModel Parser(byte[] pdfStream, TableContainType tableContainType)
{
    if (pdfStream == null)
    {
        return null;
    }

    PDDocument reader = null;
    try
    {
        InputStream sbs = new ByteArrayInputStream(pdfStream);
        reader = PDDocument.load(sbs);
        return Parser(reader, tableContainType);
    }
    catch (Exception)
    {
        // Best effort by design: any failure is reported to the caller as null.
        return null;
    }
    finally
    {
        // reader is still null when PDDocument.load itself failed, so the guard is required:
        // closing unconditionally here would replace the null result with a NullReferenceException.
        if (reader != null)
        {
            reader.close();
        }
    }
}
/// <summary>
/// Parses an already opened PDF document: extracts the text and the tables of every page,
/// then builds the merged document text.
/// </summary>
/// <param name="pdfDocument">Opened PDF document. It is not closed by this method.</param>
/// <param name="tableContainType">Format used for tables embedded in the resulting text.</param>
/// <returns>The parsed model, or <c>null</c> when <paramref name="pdfDocument"/> is null.</returns>
public static PDFModel Parser(PDDocument pdfDocument, TableContainType tableContainType)
{
    if (pdfDocument == null)
    {
        return null;
    }

    ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
    PageIterator pageIterator = extractor.extract();
    SpreadsheetExtractionAlgorithm tableExtractor = new SpreadsheetExtractionAlgorithm();

    // Extract the whole text once, with a marker after each page, then split it per page.
    PDFTextStripper pdfStripper = new PDFTextStripper();
    pdfStripper.setPageEnd(pageEndMark);
    //pdfStripper.setParagraphEnd(paragraphEndMark);
    // The marker is escaped so it is always matched literally, whatever characters it contains.
    string[] pageTexts = Regex.Split(pdfStripper.getText(pdfDocument), Regex.Escape(pageEndMark), RegexOptions.IgnoreCase);

    PDFModel pdfModel = new PDFModel();
    pdfModel.Pages = new List<PdfPageModel>();
    int pageIndex = 0;
    while (pageIterator.hasNext())
    {
        PdfPageModel pdfPage = new PdfPageModel();
        pdfPage.CurrentPage = pageIndex + 1;
        // Guard against the iterator yielding more pages than there are text segments.
        pdfPage.Text = pageIndex < pageTexts.Length ? pageTexts[pageIndex] : string.Empty;

        List<Table> tables = new List<Table>();
        Page page = pageIterator.next();
        var pageTables = tableExtractor.extract(page).toArray();
        if (pageTables != null && pageTables.Length > 0)
        {
            for (int i = 0; i < pageTables.Length; i++)
            {
                tables.Add(pageTables[i] as Table);
            }
        }
        pdfPage.Tables = tables;

        pdfModel.Pages.Add(pdfPage);
        pageIndex++;
    }

    pdfModel.PageNumber = pdfModel.Pages.Count;
    return PdfTextFormater(pdfModel, tableContainType);
}
/// <summary>
/// Builds the merged plain text of a parsed PDF and stores it in <c>pdf.Text</c>.
/// Pass 1 scans the first and last lines of every page to find page numbers, blank edge lines
/// and repeated headers/footers. Pass 2 walks all lines again, drops the lines found in pass 1,
/// replaces lines that belong to an extracted table with the serialized table, skips
/// table-of-contents lines and joins wrapped lines into paragraphs.
/// </summary>
/// <param name="pdf">Parsed PDF whose pages carry the raw text and the extracted tables.</param>
/// <param name="tableContainType">Format used when a table is written into the text.</param>
/// <returns>The same <paramref name="pdf"/> instance, with <c>Text</c> populated.</returns>
protected static PDFModel PdfTextFormater(PDFModel pdf, TableContainType tableContainType)
{
    StringBuilder sbFileContent = new StringBuilder();
    // Estimated length of a full (wrapped) line. Refined below as the average length of all
    // lines longer than 15 characters, capped at 30.
    int minLineCount = 25;
    List<int> countList = new List<int>();
    if (pdf.PageNumber > 0 && pdf.Pages != null)
    {
        // Pass 1: collect removable lines from the top and bottom of every page.
        Dictionary<string, ContentRemoveTag> maybeBeginText = new Dictionary<string, ContentRemoveTag>();
        Dictionary<string, ContentRemoveTag> maybeEndText = new Dictionary<string, ContentRemoveTag>();
        List<TagPosition> needRemovePage = new List<TagPosition>();
        for (int page = 1; page <= pdf.Pages.Count; page++)
        {
            string[] pageTexts = lineRegex.Split(pdf.Pages[page - 1].Text);
            countList.AddRange(pageTexts.Select(f => StripLineWhitespace(f).Length).Where(f => f > 15));

            // Take up to three candidate lines from the top and up to three from the bottom.
            int beginGetCount = 0;
            int endGetCount = 0;
            for (int i = 0; i < pageTexts.Length; i++)
            {
                if (beginGetCount < 3
                    && CollectEdgeLine(pageTexts, page, i, beginGetCount == 0, maybeBeginText, needRemovePage))
                {
                    beginGetCount++;
                }
                if (endGetCount < 3
                    && CollectEdgeLine(pageTexts, page, pageTexts.Length - i - 1, endGetCount == 0, maybeEndText, needRemovePage))
                {
                    endGetCount++;
                }
            }
        }

        // An edge line seen more than twice and on (almost) every page is a header or footer.
        foreach (ContentRemoveTag candidate in maybeBeginText.Values.Concat(maybeEndText.Values))
        {
            if (candidate.OccurCount > 2 && candidate.OccurCount >= (pdf.Pages.Count - 2))
            {
                needRemovePage.AddRange(candidate.tagPositions);
            }
        }

        if (countList.Count > 0)
        {
            minLineCount = Math.Min(countList.Sum() / countList.Count, 30);
        }

        // Pass 2: merge lines into paragraphs and emit tables.
        int currentTablePos = 0;
        bool isTableStarted = false;
        bool lastIsEnd = true;
        for (int page = 1; page <= pdf.Pages.Count; page++)
        {
            // A table that was still open when the previous page ended is emitted first.
            if (isTableStarted && page > 1)
            {
                AppendPendingTable(sbFileContent, pdf.Pages[page - 2], currentTablePos, tableContainType);
            }

            PdfPageModel pdfPageModel = pdf.Pages[page - 1];
            string[] pageTexts = lineRegex.Split(pdfPageModel.Text);

            // Serialize this page's tables once; lines are matched against these strings.
            List<string> tableStrs = new List<string>();
            if (pdfPageModel.Tables != null && pdfPageModel.Tables.Count > 0)
            {
                foreach (Table table in pdfPageModel.Tables)
                {
                    try
                    {
                        tableStrs.Add(TableWriter.ToString(table, tableContainType));
                    }
                    catch
                    {
                        // Deliberate best effort: a table that cannot be serialized is skipped.
                    }
                }
            }
            currentTablePos = 0;
            isTableStarted = false;

            for (int i = 0; i < pageTexts.Length; i++)
            {
                // Skip page numbers, blank edge lines and headers/footers found in pass 1.
                if (needRemovePage.Any(f => f.PageNumber == page && f.LineNumber == i))
                {
                    lastIsEnd = true;
                    continue;
                }

                string cleanText = pageTexts[i];

                // Lines whose words all occur in the current table are swallowed; the first
                // non-matching line after such a run emits the serialized table and is then
                // tested against the next table of the page.
                bool isMatchTable = false;
                while (tableStrs.Count > currentTablePos && !string.IsNullOrWhiteSpace(cleanText))
                {
                    string tableStr = tableStrs[currentTablePos];
                    isMatchTable = IsLineInTable(cleanText, tableStr);
                    if (isMatchTable)
                    {
                        isTableStarted = true;
                        break;
                    }
                    if (!isTableStarted)
                    {
                        break;
                    }
                    sbFileContent.AppendLine(NormalizeTableLineEndings(tableStr));
                    lastIsEnd = true;
                    isTableStarted = false;
                    currentTablePos++;
                }
                if (isMatchTable)
                {
                    continue;
                }

                // Skip the table-of-contents heading itself.
                string onlyText = StripLineWhitespace(cleanText);
                if (onlyText == "目录" || onlyText.ToUpper() == "MENU")
                {
                    continue;
                }
                // Skip blank lines and table-of-contents entries ("title ...... 12").
                if (string.IsNullOrWhiteSpace(onlyText) || Regex.IsMatch(onlyText, @".*?(\.{6,}\s*\d+)\s*"))
                {
                    continue;
                }

                // Does the line end with sentence-ending punctuation (half- or full-width)?
                bool endWithStopFlag = cleanText.EndsWith("。")
                    || cleanText.EndsWith("!") || cleanText.EndsWith("\uFF01")
                    || cleanText.EndsWith(":") || cleanText.EndsWith("\uFF1A")
                    || cleanText.EndsWith(";") || cleanText.EndsWith("\uFF1B");
                // Does the line contain punctuation that is typical of running text?
                bool includeNormalFlag = Regex.IsMatch(cleanText, @"[!\uFF01;\uFF1B,\uFF0C。“]");
                // Is the line at least as long as an estimated full line (whitespace excluded)?
                bool isLenThanMinLineCount = onlyText.Length >= minLineCount;

                bool firstException = false;
                // Exception 1: list-style lines that start with a bullet or an opening parenthesis.
                if (cleanText.StartsWith("◆") || cleanText.StartsWith("■")
                    || cleanText.StartsWith("(") || cleanText.StartsWith("\uFF08"))
                {
                    firstException = true;
                }
                // Exception 2: a short line containing a colon, likely a "key: value" line.
                if (!endWithStopFlag && !includeNormalFlag && !isLenThanMinLineCount
                    && (cleanText.Contains(":") || cleanText.Contains("\uFF1A")))
                {
                    firstException = true;
                }

                if (!firstException && endWithStopFlag)
                {
                    // Scenario 1: end of a paragraph. The line break is written so the next
                    // paragraph starts on its own line, matching lastIsEnd = true.
                    sbFileContent.AppendLine(cleanText);
                    lastIsEnd = true;
                }
                else if (!firstException && !endWithStopFlag && isLenThanMinLineCount)
                {
                    // Scenario 2: a full wrapped line; the paragraph continues on the next line.
                    sbFileContent.Append(cleanText);
                    lastIsEnd = false;
                }
                else if (lastIsEnd && (firstException || (!isLenThanMinLineCount && !endWithStopFlag && !includeNormalFlag)))
                {
                    // Scenario 3: a stand-alone line following a finished line.
                    sbFileContent.AppendLine(cleanText);
                    lastIsEnd = true;
                }
                else
                {
                    // Scenario 4: any other line is treated as its own paragraph.
                    // NOTE(review): scenarios 3 and 4 currently do exactly the same thing.
                    sbFileContent.AppendLine(cleanText);
                    lastIsEnd = true;
                }
            }
        }

        // A table still open at the end of the last page has no following page to flush it.
        if (isTableStarted)
        {
            AppendPendingTable(sbFileContent, pdf.Pages[pdf.Pages.Count - 1], currentTablePos, tableContainType);
        }
    }

    // Trim leading/trailing line breaks and blanks.
    pdf.Text = sbFileContent.ToString().Trim('\r', '\n', ' ', '\t');
    return pdf;
}

// Splits a line into words made of CJK ideographs, ASCII letters and digits.
private static readonly Regex tableWordSeparatorRegex = new Regex(@"[^\u4e00-\u9fa5a-zA-Z0-9]+", RegexOptions.Compiled);

/// <summary>Removes spaces (ASCII, full-width, no-break) and CR/LF characters from a line.</summary>
private static string StripLineWhitespace(string line)
{
    return line.Replace("\u3000", "").Replace("\u00A0", "").Replace(" ", "").Replace("\r", "").Replace("\n", "");
}

/// <summary>
/// Examines one line near the top or bottom of a page. Digit-only lines shorter than ten
/// characters, and blank lines seen before any candidate, are queued for removal; every other
/// non-blank line is recorded as a header/footer candidate.
/// </summary>
/// <returns><c>true</c> when the line was recorded as a candidate.</returns>
private static bool CollectEdgeLine(string[] pageTexts, int page, int lineNumber, bool noCandidateYet,
    Dictionary<string, ContentRemoveTag> candidates, List<TagPosition> needRemovePage)
{
    TagPosition tagPosition = new TagPosition();
    tagPosition.PageNumber = page;
    tagPosition.LineNumber = lineNumber;

    string cleanText = StripLineWhitespace(pageTexts[lineNumber]);
    if (string.IsNullOrWhiteSpace(cleanText))
    {
        if (noCandidateYet)
        {
            needRemovePage.Add(tagPosition);
        }
        return false;
    }

    int numberCount = NumberOfDigits(cleanText);
    if (numberCount == cleanText.Length && numberCount < 10 && numberCount >= 1)
    {
        needRemovePage.Add(tagPosition);
        return false;
    }

    ContentRemoveTag tag;
    if (!candidates.TryGetValue(cleanText, out tag))
    {
        tag = new ContentRemoveTag();
        tag.Content = cleanText;
        tag.tagPositions = new List<TagPosition>();
        candidates[cleanText] = tag;
    }
    tag.OccurCount++;
    tag.tagPositions.Add(tagPosition);
    return true;
}

/// <summary>
/// Returns true when the line has at least one word and every word occurs in the table text.
/// </summary>
private static bool IsLineInTable(string line, string tableText)
{
    bool matched = false;
    foreach (string word in tableWordSeparatorRegex.Split(line))
    {
        if (string.IsNullOrWhiteSpace(word))
        {
            continue;
        }
        if (!tableText.Contains(word))
        {
            return false;
        }
        matched = true;
    }
    return matched;
}

/// <summary>Rewrites the line endings of a serialized table as CR LF.</summary>
private static string NormalizeTableLineEndings(string tableText)
{
    return tableText.Replace("\r", "").Replace("\n", "\r\n");
}

/// <summary>Appends the table at <paramref name="tablePos"/> of the given page, if it exists.</summary>
private static void AppendPendingTable(StringBuilder output, PdfPageModel pageModel, int tablePos, TableContainType tableContainType)
{
    if (pageModel.Tables != null && pageModel.Tables.Count > tablePos)
    {
        string tableText = TableWriter.ToString(pageModel.Tables[tablePos], tableContainType);
        output.AppendLine(NormalizeTableLineEndings(tableText));
    }
}
/// <summary>
/// Counts the characters of <paramref name="theString"/> for which <c>char.IsDigit</c> is true.
/// </summary>
/// <param name="theString">Text to inspect.</param>
/// <returns>The number of digit characters in the text.</returns>
protected static int NumberOfDigits(string theString)
{
    int digitTotal = 0;
    foreach (char current in theString)
    {
        if (char.IsDigit(current))
        {
            digitTotal += 1;
        }
    }
    return digitTotal;
}
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using technology.tabula;
namespace Thrinax.Parser.PDFParser
{
/// <summary>
/// Result model for a parsed PDF document: document-level fields plus the per-page models.
/// </summary>
public class PDFModel
{
    /// <summary>Page count of the document.</summary>
    public int PageNumber { get; set; }

    /// <summary>File name of the document.</summary>
    public string FileName { get; set; }

    /// <summary>Creation time of the document.</summary>
    public DateTime CreateTime { get; set; }

    /// <summary>Modification time of the document.</summary>
    public DateTime ModifyTime { get; set; }

    /// <summary>Author of the document.</summary>
    public string Author { get; set; }

    /// <summary>Text of the whole document.</summary>
    public string Text { get; set; }

    /// <summary>Per-page models.</summary>
    public List<PdfPageModel> Pages { get; set; }
}
/// <summary>
/// Content of a single PDF page.
/// </summary>
public class PdfPageModel
{
    /// <summary>Number of this page within the document.</summary>
    public int CurrentPage { get; set; }

    /// <summary>Text of this page.</summary>
    public string Text { get; set; }

    /// <summary>Images of this page.</summary>
    public List<string> Images { get; set; }

    /// <summary>Tabula tables of this page.</summary>
    public List<Table> Tables { get; set; }

    /// <summary>Codes of this page.</summary>
    public List<string> Codes { get; set; }

    /// <summary>Structs of this page.</summary>
    public List<string> Structs { get; set; }
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Thrinax.Parser.PDFParser
{
/// <summary>
/// Empty placeholder type: it currently declares no members.
/// </summary>
public class PDFTableMatch
{
}
}
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// General information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("Thrinax.Parser.PDFParser")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("Thrinax.Parser.PDFParser")]
[assembly: AssemblyCopyright("Copyright © 2019")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from COM,
// set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("579a06a4-8b76-4407-b47a-2e29effc43f9")]
// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Thrinax.Parser.PDFParser
{
/// <summary>
/// Format in which an extracted table is included in the output.
/// </summary>
public enum TableContainType
{
/// <summary>
/// Use the CSV format.
/// </summary>
CSV = 1,
/// <summary>
/// Use the JSON format.
/// </summary>
Json = 2,
/// <summary>
/// Use the TSV format.
/// </summary>
TSV = 3
}
}
using java.io;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using technology.tabula;
using technology.tabula.writers;
namespace Thrinax.Parser.PDFParser
{
/// <summary>
/// Serializes tabula tables as CSV, JSON or TSV, either into a file or into a string.
/// </summary>
public class TableWriter
{
    /// <summary>
    /// Writes a table to a UTF-8 file, creating the target directory when it does not exist.
    /// </summary>
    /// <param name="filePath">Path of the file to write; an existing file is overwritten.</param>
    /// <param name="table">Table to serialize.</param>
    /// <param name="tableContainType">Output format; unknown values fall back to JSON.</param>
    /// <returns>
    /// <c>true</c> when the table was written and the file was closed successfully;
    /// <c>false</c> when opening, writing or closing the file failed.
    /// </returns>
    public static bool ToFile(string filePath, Table table, TableContainType tableContainType)
    {
        FileInfo targetInfo = new FileInfo(filePath);
        if (!Directory.Exists(targetInfo.DirectoryName))
        {
            Directory.CreateDirectory(targetInfo.DirectoryName);
        }

        java.io.File outputFile = new java.io.File(filePath);
        BufferedWriter bufferedWriter = null;
        bool succeeded = false;
        try
        {
            // Opening the stream creates (or truncates) the file, so no separate create call is needed.
            var fileWriter = new OutputStreamWriter(new FileOutputStream(outputFile.getAbsoluteFile()), "UTF-8");
            bufferedWriter = new BufferedWriter(fileWriter);
            CreateWriter(tableContainType).write(bufferedWriter, table);
            succeeded = true;
        }
        catch
        {
            succeeded = false;
        }
        finally
        {
            // Closing flushes the buffered output and releases the file handle. Without it the
            // data may never reach the file, so a failure here is reported as a failed write.
            if (bufferedWriter != null)
            {
                try
                {
                    bufferedWriter.close();
                }
                catch
                {
                    succeeded = false;
                }
            }
        }
        return succeeded;
    }

    /// <summary>
    /// Serializes a table into a string.
    /// </summary>
    /// <param name="table">Table to serialize.</param>
    /// <param name="tableContainType">Output format; unknown values fall back to JSON.</param>
    /// <returns>The serialized table, or an empty string when serialization throws.</returns>
    public static string ToString(Table table, TableContainType tableContainType)
    {
        java.io.StringWriter stringWriter = new java.io.StringWriter();
        try
        {
            CreateWriter(tableContainType).write(stringWriter, table);
        }
        catch
        {
            return string.Empty;
        }
        return stringWriter.toString();
    }

    /// <summary>Creates the tabula writer for the requested format (JSON for unknown values).</summary>
    private static technology.tabula.writers.Writer CreateWriter(TableContainType tableContainType)
    {
        switch (tableContainType)
        {
            case TableContainType.CSV:
                return new CSVWriter();
            case TableContainType.TSV:
                return new TSVWriter();
            case TableContainType.Json:
            default:
                return new JSONWriter();
        }
    }
}
}
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<!-- Project-wide settings: builds the Thrinax.Parser.PDFParser class library for .NET Framework v4.6.1. Configuration and Platform fall back to Debug / AnyCPU when they are not supplied. -->
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{579A06A4-8B76-4407-B47A-2E29EFFC43F9}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Thrinax.Parser.PDFParser</RootNamespace>
<AssemblyName>Thrinax.Parser.PDFParser</AssemblyName>
<TargetFrameworkVersion>v4.6.1</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<Deterministic>true</Deterministic>
</PropertyGroup>
<!-- Debug|AnyCPU: unoptimized build with full debug symbols, DEBUG and TRACE defined, output to bin\Debug\. -->
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<!-- Release|AnyCPU: optimized build with pdb-only debug information, only TRACE defined, output to bin\Release\. -->
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<!-- IKVM 8.1.5717.0 assemblies: this and the following IKVM.* references all resolve from ..\packages\IKVM.8.1.5717.0\lib. -->
<Reference Include="IKVM.AWT.WinForms, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.AWT.WinForms.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Beans, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Beans.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Charsets, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Charsets.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Cldrdata, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Cldrdata.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Corba, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Corba.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Core, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Core.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Jdbc, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Jdbc.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Localedata, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Localedata.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Management, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Management.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Media, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Media.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Misc, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Misc.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Naming, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Naming.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Nashorn, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Nashorn.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Remoting, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Remoting.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Security, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Security.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.SwingAWT, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.SwingAWT.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Text, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Text.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Tools, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Tools.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.Util, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.Util.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.XML.API, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.XML.API.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.XML.Bind, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.XML.Bind.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.XML.Crypto, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.XML.Crypto.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.XML.Parse, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.XML.Parse.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.XML.Transform, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.XML.Transform.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.XML.WebServices, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.XML.WebServices.dll</HintPath>
</Reference>
<Reference Include="IKVM.OpenJDK.XML.XPath, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.OpenJDK.XML.XPath.dll</HintPath>
</Reference>
<Reference Include="IKVM.Runtime, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.Runtime.dll</HintPath>
</Reference>
<Reference Include="IKVM.Runtime.JNI, Version=8.1.5717.0, Culture=neutral, PublicKeyToken=13235d27fcbfff58, processorArchitecture=MSIL">
<HintPath>..\packages\IKVM.8.1.5717.0\lib\IKVM.Runtime.JNI.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Data" />
<Reference Include="System.Net.Http" />
<Reference Include="System.Xml" />
<!-- tabula.dll is referenced from the local ..\libs folder rather than from ..\packages; SpecificVersion=False means the version in the reference name does not have to match exactly. -->
<Reference Include="tabula, Version=0.0.0.0, Culture=neutral, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\libs\tabula.dll</HintPath>
</Reference>
</ItemGroup>
<!-- C# source files compiled into the assembly. -->
<ItemGroup>
<Compile Include="ContentRemoveTag.cs" />
<Compile Include="PDFModel.cs" />
<Compile Include="PDFParser.cs" />
<Compile Include="PDFTableMatch.cs" />
<Compile Include="TableContainType.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="TableWriter.cs" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
</Project>
\ No newline at end of file
<?xml version="1.0" encoding="utf-8"?>
<!-- NuGet packages for this project: IKVM 8.1.5717.0, targeting .NET Framework 4.6.1 (net461). -->
<packages>
<package id="IKVM" version="8.1.5717.0" targetFramework="net461" />
</packages>
\ No newline at end of file
/*
Navicat MySQL Data Transfer
Source Server : mech.gimind.com
Source Server Version : 50556
Source Host : mech.gimind.com:3306
Source Database : pdftojson
Target Server Type : MYSQL
Target Server Version : 50556
File Encoding : 65001
Date: 2019-02-15 16:23:39
*/
-- Disable foreign-key checks while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for `pdftojson`
-- ----------------------------
-- WARNING: this drops the existing table together with all of its rows.
DROP TABLE IF EXISTS `pdftojson`;
-- NOTE(review): a MySQL TEXT column holds at most 65,535 bytes; confirm that the
-- JSON / FmtTxt / CSVURIS values always fit, otherwise MEDIUMTEXT or LONGTEXT is needed.
-- AUTO_INCREMENT=76 is carried over from the dumped server, so the first new row gets ID 76.
CREATE TABLE `pdftojson` (
`ID` int(11) NOT NULL AUTO_INCREMENT,
`FileName` varchar(1000) DEFAULT NULL,
`CreateTime` datetime DEFAULT NULL,
`PDFURI` varchar(2000) DEFAULT NULL,
`TXTURI` varchar(2000) DEFAULT NULL,
`JSON` text,
`FmtTxt` text,
`CSVURIS` text,
PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=76 DEFAULT CHARSET=utf8;
-- Re-enable foreign-key checks so the setting does not leak into the rest of the session.
SET FOREIGN_KEY_CHECKS=1;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment