Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
smallproject
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
mahaisong
smallproject
Commits
65e7ff02
Commit
65e7ff02
authored
Jan 04, 2019
by
mahaisong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:pdftojson py 修改
parent
544bf78e
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
29 additions
and
18 deletions
+29
-18
pocTXTHandler.cpython-36.pyc
...ToJson/PocTxt_PY/__pycache__/pocTXTHandler.cpython-36.pyc
+0
-0
pocTXTHandler.py
30.PdfToJson/PocTxt_PY/pocTXTHandler.py
+29
-18
No files found.
30.PdfToJson/PocTxt_PY/__pycache__/pocTXTHandler.cpython-36.pyc
View file @
65e7ff02
No preview for this file type
30.PdfToJson/PocTxt_PY/pocTXTHandler.py
View file @
65e7ff02
...
...
@@ -158,8 +158,10 @@ class poc:
self
.
file_name
=
re
.
sub
(
'.html'
,
''
,
file
)
self
.
data
=
{
# =============================================================================
# =============================================================================
# '文件名': re.sub('.html', '', file),
# =============================================================================
# =============================================================================
'发行人'
:
self
.
issuer
(),
# =============================================================================
# '发行人': file.split('20')[0],
...
...
@@ -400,23 +402,27 @@ class poc:
else
:
ret
=
''
ret
=
re
.
sub
(
'[:,,:“”【】 为]'
,
''
,
ret
)
if
re
.
search
(
'[0-9]'
,
ret
):
if
re
.
search
(
'[0-9]
+
'
,
ret
):
if
'亿'
in
ret
:
ret
=
re
.
sub
(
u'[
\u4e00
-
\u9fff
]'
,
''
,
ret
)
try
:
ret
=
str
(
float
(
ret
)
*
100000000
)
except
:
ret
=
''
else
:
ret
=
re
.
sub
(
u'[
\u4e00
-
\u9fff
]'
,
''
,
ret
)
if
len
(
ret
)
>
12
:
ret
=
''
else
:
ret
=
re
.
findall
(
r'[一壹二贰三叁四肆五伍六陆七柒八捌九玖十拾百佰千仟万萬亿億]+'
,
ret
)
if
ret
:
ret
=
ret
[
0
]
else
:
ret
=
''
if
not
re
.
search
(
'[0-9]'
,
ret
):
if
not
re
.
search
(
'[0-9]
+
'
,
ret
):
try
:
ret
=
str
(
chinese_to_arabic
(
ret
))
except
Exception
as
err
:
except
:
pass
if
re
.
match
(
r'\d+'
,
ret
):
return
str
(
round
(
float
(
ret
),
2
))
...
...
@@ -500,36 +506,39 @@ class poc:
start
,
end
=
s
.
span
()
ret
=
t
[(
start
+
1
):
end
]
try
:
if
(
t
[
end
]
==
'-'
)
or
(
t
[
end
]
==
'至'
)
or
(
t
[
end
]
==
'、'
)
or
(
t
[
end
]
==
'—'
)
or
(
t
[
end
]
==
'及'
):
if
end
<
len
(
t
)
and
((
t
[
end
]
==
'-'
)
or
(
t
[
end
]
==
'至'
)
or
(
t
[
end
]
==
'、'
)
or
(
t
[
end
]
==
'—'
)
or
(
t
[
end
]
==
'及'
)
):
a
=
re
.
findall
(
'[^
\
-至、—]*?日'
,
t
[(
end
+
1
):])
if
not
a
:
ret
=
re
.
sub
(
':|【|】|期|指|〔|〕| |为|
\
[|
\
]|
\
(|
\
)|(|)|发行|变更'
,
''
,
ret
)
print
(
'aa'
)
return
ret
ret
=
ret
+
'至'
+
a
[
0
]
# =============================================================================
# if re.match([0-9],ret) is False:
# return ''
# =============================================================================
# =============================================================================
# if re.match([0-9],ret) is False:
# return ''
# =============================================================================
ret
=
re
.
sub
(
':|【|】|期|指|〔|〕| |为|
\
[|
\
]|
\
(|
\
)|(|)|发行|变更'
,
''
,
ret
)
tmp_file_name
=
self
.
file_name
tmp_file_name
=
re
.
sub
(
r'\(.{2}\)'
,
''
,
tmp_file_name
)
tmp_file_name
=
re
.
sub
(
r'(.{2})'
,
''
,
tmp_file_name
)
tmp_file_name
=
re
.
sub
(
r'\.txt'
,
''
,
tmp_file_name
)
tmp_file_name
=
re
.
sub
(
r'期'
,
''
,
tmp_file_name
)
ret
=
re
.
sub
(
tmp_file_name
,
''
,
ret
)
if
tmp_file_name
in
ret
:
print
(
'########'
)
ret
=
ret
.
replace
(
tmp_file_name
,
''
)
ret
=
re
.
sub
(
'起息日'
,
''
,
ret
)
ret
=
ret
.
split
(
':'
)[
-
1
]
if
ret
[:
0
]
==
'日'
:
ret
=
ret
[
1
:]
if
re
.
findall
(
'[0-9]+'
,
ret
):
return
ret
[
1
:]
ret
=
ret
.
split
(
':'
)[
-
1
]
if
re
.
search
(
'[0-9]+'
,
ret
):
if
ret
[
0
]
==
'日'
:
return
ret
[
1
:]
else
:
return
ret
else
:
return
''
except
Exception
as
err
:
p
rint
(
err
)
except
:
p
ass
else
:
print
(
'warning:'
,
t
)
return
ret
def
value_date
(
self
):
...
...
@@ -584,8 +593,10 @@ if __name__ == '__main__':
# single instance:
# =============================================================================
# folder= './full'
# =============================================================================
folder
=
'D:
\\
qqFILE
\\
result
\\
full'
folder2
=
'D:
\\
data
\\
full'
filelist
=
os
.
listdir
(
folder
)
filelist
=
[
file
for
file
in
filelist
if
re
.
search
(
'.txt'
,
file
)]
# file = filelist[0]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment