feat:合并pdftojson、更新python。整体提交一次。

b2cdc14f · mahaisong · e45ab078 · b2cdc14f · b2cdc14f · b2cdc14f
Commit b2cdc14f authored Feb 15, 2019 by mahaisong
11 changed files
--- a/17.identity第三方/identityserver表名大写备份190124.sql
+++ b/17.identity第三方/identityserver表名大写备份190124.sql
--- a/17.identity第三方/大写表名190124更改了哪些表.txt
+++ b/17.identity第三方/大写表名190124更改了哪些表.txt
+b.ToTable("AspNetUsers"); 
+b.ToTable("AspNetUsers"); 
+                    b.ToTable("AspNetRoles");
+                    b.ToTable("AspNetRoleClaims");
+                    b.ToTable("AspNetUserClaims");
+                    b.ToTable("AspNetUserLogins");
+                    b.ToTable("AspNetUserRoles");
+                    b.ToTable("AspNetUserTokens");
+IdentityResources
+IdentityClaims
+ApiResources
+ApiSecrets
+ApiScopes
+ApiClaims
+ApiScopeClaims
+Clients
+ClientGrantTypes
+ClientRedirectUris
+ClientPostLogoutRedirectUris
+ClientScopes
+ClientSecrets
+ClientClaims
+ClientIdPRestrictions
+ClientCorsOrigins
+ClientProperties 
+PersistedGrants
+½һ
+SET FOREIGN_KEY_CHECKS=0;
+-- ----------------------------
+-- Table structure for `apiclaims`
+-- ----------------------------
+DROP TABLE IF EXISTS `ApiClaims`;
+CREATE TABLE `ApiClaims` (
+  `Id` int(11) NOT NULL AUTO_INCREMENT,
+  `ApiResourceId` int(11) NOT NULL,
+  `Type` varchar(200) NOT NULL,
+  PRIMARY KEY (`Id`) USING BTREE,
+  KEY `FK_ApiClaims_ApiResources_ApiResourceId` (`ApiResourceId`) USING BTREE,
+  CONSTRAINT `FK_ApiClaims_ApiResources_ApiResourceId` FOREIGN KEY (`ApiResourceId`) REFERENCES `ApiResources` (`Id`) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT;
--- a/23.埋点（国海-积财）/20181010埋点接口说明.docx
+++ b/23.埋点（国海-积财）/20181010埋点接口说明.docx
--- a/26.海通改版/智能资讯系统部署文档.docx
+++ b/26.海通改版/智能资讯系统部署文档.docx
--- a/26.海通改版/海通智能资讯应急预案.docx
+++ b/26.海通改版/海通智能资讯应急预案.docx
--- a/29.IIS日志分析/LogParser.msi
+++ b/29.IIS日志分析/LogParser.msi
--- a/29.IIS日志分析/go.bat
+++ b/29.IIS日志分析/go.bat
+echo off
+echo off
+:: 源目录 (source directory)
+set sourcePath=D:\Log\input\119log\
+:: 目标路径 (target path)
+cd /d C:\Program Files (x86)\Log Parser 2.2
+for /r %sourcePath% %%i in (*.log) do (
+logparser -i:iisw3c -o:csv "SELECT  cs-uri-stem,cs-uri-query, sc-status,c-ip,time-taken,TO_TIMESTAMP(date,time) as TimeStamp  into 'D:\Log\output\119\%%~ni.csv' FROM input\119log\%%~ni.log"  
+)
+pause
\ No newline at end of file
--- a/29.IIS日志分析/海通数据提取.txt
+++ b/29.IIS日志分析/海通数据提取.txt
+林老师您好：
+林老师您好：
+我们想统计一下12月（12月1日0点~~1月1日0点）的接口数据，能不能麻烦您帮助提供。（和上次统计11月份的统计内容相同）
+1.复制:HIVE数据，执行夏阳夏老师的SQL语句，导出文件。
+  提取点击记录.sql
+  提取推荐文章是哪些，是否深度阅读.sql
+2.复制:3台机器下IIS的日志。
+3.在ES中查询可推荐文章条数。(按照下面步骤选择即可)
+3.1打开TAB页面ES-基本查询。 
+3.2搜索 palas_v3的文档，查询条件为 must、 items.pubDate、range、 gt、2018-12-01 
+3.3得到命中数字--即为条数。
--- a/30.PdfToJson/PocTxt_PY/pocTXTHandler.py
+++ b/30.PdfToJson/PocTxt_PY/pocTXTHandler.py
@@ -152,20 +152,11 @@ class poc:
        with open(dir + '/' + file, 'r', encoding='utf-8') as fp:
            for line in fp.readlines():
                self.textlist.append(re.sub('\n', '', line))
-# =============================================================================
+        # self.textlist = [i for i in self.textlist if i != '']
-#         self.textlist = [i for i in self.textlist if i != '']
-# =============================================================================
        self.file_name = re.sub('.html', '', file)
        self.data = {
-# =============================================================================
+            '文件名': re.sub('.html', '', file),
-# =============================================================================
-#             '文件名': re.sub('.html', '', file),
-# =============================================================================
-# =============================================================================
            '发行人': self.issuer(),
-# =============================================================================
-#             '发行人': file.split('20')[0],
-# =============================================================================
            '主承销商': self.lead_underwriter(),
            '联席主承销商': self.joint_lead_underwriter(),
            '发行金额（元）': self.issue_amount(),
@@ -174,32 +165,28 @@ class poc:
            '发行日': self.issue_date(),
            '起息日': self.value_date(),
        }
-        print(self.data['起息日'])
    def issuer(self):
-        iii=self.textlist[0]
+        iii=self.textlist[0].split(',')[-1].split(':')[-1]
        try:
            companyname=re.findall('.*?公司',iii)[0]
-        except Exception as e:
+        except Exception:
            try:
                companyname=re.findall('.*?公司',self.textlist[1])[0]
-            except Exception as e:
+            except Exception:
                try:
                    companyname=re.findall('.*?公司',self.textlist[2])[0]
-                except Exception as e:
+                except Exception:
                     if file.split('20')[0]:
                         companyname=file.split('20')[0]
                     else:
                         print ('')
-        return str(companyname)
+        return str(companyname).split(',')[-1].split(':')[-1]
    def lead_underwriter(self):
        def normalize(text):
            ret = re.findall('.*公司|.*银行', text)[0]
-# =============================================================================
+            # ret = ret .split('联席主承销商')[0]
-#             ret = ret .split('联席主承销商')[0]
-# =============================================================================
            ret = ret.split('\n')[-1]
            ret = ret.split('二')[0]
            ret = ret.split('人')[-1]
@@ -208,8 +195,8 @@ class poc:
            ret = ret. split('即')[-1]
            ret = ret. split(' ')[-1]
            ret = ret. split('：')[-1]
-            ret=  ret. split ('主承销商')[-1]
+            ret = ret. split ('主承销商')[-1]
-            ret=  ret. split ('为')[-1]
+            ret = ret. split ('为')[-1]
            return ret
        mainpos = locate('目', self.textlist)
@@ -225,11 +212,13 @@ class poc:
                ret = re.findall('主承销商.*?公司|主承销商.*?银行', text)[0]
                if re.search('联席主承销商',ret):
                    continue
+                if '《' in ret or '》' in ret:
+                    continue
                ret = ret.split('：')[-1]
                ret = normalize(ret)
                if len(ret)<=4:
                    return ''
-                return ret
+                return ret.split(',')[-1].split(':')[-1]
        for n, text in enumerate(self.textlist[:mainpos]):
            if re.search('主承销商', text):
@@ -237,7 +226,7 @@ class poc:
                if ret:
                    if re.search('联席主承销商',ret[0]):
                        continue
-                    if re.sub('\(|\)|（|）','',normalize(ret[0])) ==re.sub('\(|\)|（|）','',str(self.issuer())):
+                    if re.sub(r'\(|\)|（|）','',normalize(ret[0])) == re.sub(r'\(|\)|（|）','',str(self.issuer())):
                        continue
                    if len(normalize(ret[0]))<4:
                        return ''
@@ -247,12 +236,12 @@ class poc:
                    if ret:
                        if re.search('联席主承销商',ret[0]):
                            continue
-                        if re.sub('\(|\)|（|）','',normalize(ret[0])) ==re.sub('\(|\)|（|）','',str(self.issuer())):
+                        if re.sub(r'\(|\)|（|）','',normalize(ret[0])) == re.sub(r'\(|\)|（|）','',str(self.issuer())):
                            continue
                        if len(normalize(ret[0]))<4:
                            return ''
                        return normalize(ret[0])
        for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]):
            t = ''.join(self.textlist[(n+mainpos2):(n+mainpos2+5)])
            if re.search('主承销商.*公司|主承销商.*银行|簿记管理人.*公司|簿记管理人.*银行', t):
@@ -261,11 +250,10 @@ class poc:
                ret = normalize(t)
                if re.search('年度第',t):
                    continue
-                if re.sub('\(|\)|（|）','',ret) ==re.sub('\(|\)|（|）','',str(self.issuer())):
+                if re.sub(r'\(|\)|（|）','',ret) == re.sub(r'\(|\)|（|）','',str(self.issuer())):
                    continue
-                return ret
+                return ret.split('•')[-1]
-# =============================================================================
 #         for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]):
 #             t = ''.join(self.textlist[(n+mainpos2):(n+mainpos2+5)])
 #             if re.search('簿记管理人.*公司|簿记管理人.*银行', t):
@@ -275,10 +263,7 @@ class poc:
 #                 if ret ==self.issuer():
 #                     continue
 #                 return ret
-# =============================================================================
-# =============================================================================
 #         for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]):
 #             if re.search('主承销商', text):
 #                 ret = re.findall('.*?公司|.*?银行', self.textlist[n+1])
@@ -292,7 +277,6 @@ class poc:
 #                         if re.search('与主承销商签订',ret):
 #                             continue
 #                         return normalize(ret[0])
-# =============================================================================
        if not ret:
            ret = ''
@@ -326,64 +310,67 @@ class poc:
        for n, text in enumerate(self.textlist[:mainpos]):
            if re.search('联席主承销商.*公司|联席主承销商.*银行', text):
                ret = re.findall('联席主承销商.*?公司|联席主承销商.*?银行', text)[0]
+                if '《' in ret or '》' in ret:
+                    continue
                ret = ret.split('：')[-1]
                ret = normalize(ret)
-                print (1)
+                if re.sub(r'\(|\)|\（|\）','',ret) == re.sub(r'\(|\)|\（|\）','',str(self.issuer())):
-                if re.sub('\(|\)|\（|\）','',ret) ==re.sub('\(|\)|\（|\）','',str(self.issuer())):
                    continue
-# =============================================================================
+                # if re.search('联席主承销商：', ret):
-#                 if re.search('联席主承销商：', ret):
+                #    continue
-#                     continue
+                return ret.split(':')[-1]
-# =============================================================================
-                return ret
        for n, text in enumerate(self.textlist[:mainpos]):
            if re.search('联席主承销商', text):
                ret = re.findall('.*?公司|.*?银行', self.textlist[n + 1])
                if ret:
                    ret = normalize(ret[0])
-                    if re.sub('\(|\)|（|）','',ret) ==re.sub('\(|\)|（|）','',str(self.issuer())):
+                    if '《' in ret or '》' in ret:
                        continue
-                    return ret
+                    if re.sub(r'\(|\)|（|）','',ret) == re.sub(r'\(|\)|（|）','',str(self.issuer())):
+                        continue
+                    return ret.split(':')[-1].split(',')[-1]
                else:
                    ret = re.findall('.*?公司|.*?银行', self.textlist[n + 2])
                    if ret:
-                        ret = normalize(ret[0])                        
+                        ret = normalize(ret[0])
-                        if re.sub('\(|\)|（|）','',ret) ==re.sub('\(|\)|（|）','',str(self.issuer())):
+                        if '《' in ret or '》' in ret:
+                            continue
+                        if re.sub(r'\(|\)|（|）','',ret) == re.sub(r'\(|\)|（|）','',str(self.issuer())):
                            continue
-                        return ret         
+                        return ret.split(':')[-1].split(',')[-1]
        for n, text in enumerate(self.textlist[:mainpos]):  
            t = ''.join(self.textlist[:mainpos])
            if not re.search('联席主承销商', t) :
-                ret=''
+                return ''
-                return ret
        for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]):
            t = ''.join(self.textlist[(n+mainpos2):(n+mainpos2+6)])
            if len(t)<55:
                t= ''.join(self.textlist[(n+mainpos2):(n+mainpos2+7)])
            if re.search('联席主承销商.*公司|联席主承销商.*银行', t):
-                if re.search('《',t):
+                if '《' in t or '》' in t:
                    continue
-# =============================================================================
+                # if re.search(self.issuer(),t):
-#                 if re.search(self.issuer(),t):
+                #     continue
-#                     continue
-# =============================================================================
                ret = normalize(t)
                if re.search('银行间市场',ret):
                    continue
-                if re.sub('\(|\)|（|）','',ret) ==re.sub('\(|\)|（|）','',str(self.issuer())):
+                if re.sub(r'\(|\)|（|）','',ret) ==re.sub(r'\(|\)|（|）','',str(self.issuer())):
                    continue
                if re.search('年度第',t):
                    continue
                return ret
-        if ret:
-            if re.sub('\(|\)|（|）','',ret) ==re.sub('\(|\)|（|）','',str(self.issuer())):
-                ret=''    
-        if not ret:
-            ret = ''
-        return ret
+        # if ret:
+        #     if re.sub('\(|\)|（|）','',ret) == re.sub('\(|\)|（|）','',str(self.issuer())):
+        #         ret = ''
+        # if not ret:
+        #     ret = ''
+        # return ret
+        return ''
    def issue_amount(self):
@@ -394,37 +381,38 @@ class poc:
                    ret = text
                    break
                else:
-                    ret = ''.join(self.textlist[n:(n+5)])
+                    ret = ''.join(self.textlist[n:(n+1)])
                    break
        ret = re.findall('本期.*?元|本期.*?亿|发行金额.*?元|发行金额.*?亿', ret)
        if ret:
            ret = ret[0]
        else:
            ret = ''
-        ret = re.sub('[：,，:“”【】 为]', '', ret)
+        ret = re.sub('[：,，:“”【】 为"]', '', ret)
-        if re.search('[0-9]+', ret):
+        if re.search(r'\d+', ret):
-            if '亿' in ret:
+            d_tmp = 1
-                ret = re.sub(u'[\u4e00-\u9fff]', '', ret)
+            for ri in CN_UNIT:
-                try:
+                if ri in ret:
-                    ret = str(float(ret) * 100000000)
+                    d_tmp *= CN_UNIT[ri]
-                except:
+            ret = re.sub(u'[\u4e00-\u9fff]', '', ret)
-                    ret = ''
+            try:
-            else:
+                ret = str(float(ret) * d_tmp)
-                ret = re.sub(u'[\u4e00-\u9fff]', '', ret)
+            except:
-                if len(ret) > 12:
+                ret = ''
-                    ret = ''
+            if len(ret) > 12:
+                ret = ''
        else:
            ret = re.findall(r'[一壹二贰三叁四肆五伍六陆七柒八捌九玖十拾百佰千仟万萬亿億]+', ret)
            if ret:
                ret = ret[0]
            else:
                ret = ''
-        if not re.search('[0-9]+', ret):
+        if not re.search(r'\d+', ret):
            try:
                ret = str(chinese_to_arabic(ret))
            except:
                pass
-        if re.match(r'\d+',ret):   
+        if re.match(r'\d+',ret):
            return str(round(float(ret),2))
        else:
            return ret
@@ -435,7 +423,7 @@ class poc:
            if re.search('期限|发行期限', text):
                if re.search('还本付息', text):
                    continue
-                ret = ''.join(self.textlist[n:(n+4)])
+                ret = ''.join(self.textlist[n:(n+1)])
                break
        ret = re.findall('期限.*?[天年N]', ret)
        if ret:
@@ -444,8 +432,11 @@ class poc:
                ret += '年'
        else:
             ret = ''
-        ret = re.sub('[：,，:“”【】 为]', '', ret)
+        ret = re.sub('[：,，:“”【】 为"]', '', ret)
-        return ret
+        if len(ret) < 10:
+            return ret
+        else:
+            return ''
    def face_value(self):
        _num = r'((\d+\.?\d*)|([十拾]?[一壹二贰三叁四肆五伍六陆七柒八捌九玖][十拾百佰千仟万萬亿億]?))'
@@ -453,10 +444,11 @@ class poc:
        _reg = r'(面值\S{0,10}\s*{})|(票面金额\S{0,10}\s*{})'.replace('{}', _rmb)
        ret = ''
        for n, txt in enumerate(self.textlist):
-            text = ''.join(self.textlist[n:(n+5)])
+            text = ''.join(self.textlist[n:(n+1)])
            tmp = re.search(_reg, text)
            if tmp:
-                if '每股面值' not in text and '工具面值' not in text and '应收账款面值' not in text and '应收款面值' not in text and '认购单位' not in text and '按面值' not in text and '面值总' not in text and '面值）' not in text:
+                # print(tmp.group(0), text)
+                if '每股面值' not in text and '工具面值' not in text and '应收账款面值' not in text and '应收款面值' not in text and '认购单位' not in text and '按面值' not in text and '面值总' not in text and '面值）' not in text and '面值)' not in text:
                    ret = re.search(_num, tmp.group(0)).group(0)
                    break
        if len(ret) > 0 and ret[0] in ['十','拾']:
@@ -489,14 +481,14 @@ class poc:
                if re.search('还本付息', text):
                    continue
-                if (len(text) > 40) & (('2018' not in text) or ('月' not in text) or ('日' not in text)):
+                if (len(text) > 40) and (('2018' not in text) or ('月' not in text) or ('日' not in text)):
                    continue
                if re.search('披露|安排|规定|基本|存续', text):
                    continue
                exam_con=''.join(self.textlist[(n):(n+5)])
                if re.search('起息', exam_con) is False:
                    continue
-                t = ''.join(self.textlist[(n):(n+2)])
+                t = ''.join(self.textlist[n:n+2])
                ##提取当前行及下面两行
                if re.search('中国货币网', t):
                    continue
@@ -507,28 +499,27 @@ class poc:
                    ret = t[(start+1):end]
                    try:
                        if end < len(t) and ((t[end] == '-') or (t[end] == '至') or (t[end] == '、') or (t[end] == '—') or (t[end] == '及')):
-                            a = re.findall('[^\-至、—]*?日', t[(end+1):])
+                            a = re.findall(r'[^\-至、—]*?日', t[(end+1):])
                            if not a:
-                                ret = re.sub('：|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|（|）|发行|变更', '', ret)
+                                ret = re.sub(r'：|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|（|）|发行|变更', '', ret)
-                                print ('aa')
+                                # print ('aa')
                                return ret
                            ret = ret+'至'+a[0]
-    # =============================================================================
+                            # if re.match([0-9],ret) is False:
-    #                         if re.match([0-9],ret) is False:
+                            #    return ''
-    #                             return ''
+                        ret = re.sub(r'：|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|（|）|发行|变更', '', ret)
-    # =============================================================================
-                        ret = re.sub('：|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|（|）|发行|变更', '', ret)
                        tmp_file_name = self.file_name
                        tmp_file_name = re.sub(r'\(.{2}\)','',tmp_file_name)
                        tmp_file_name = re.sub(r'（.{2}）','',tmp_file_name)
                        tmp_file_name = re.sub(r'\.txt','',tmp_file_name)
                        tmp_file_name = re.sub(r'期','',tmp_file_name)
-                        if tmp_file_name in ret:
+                        # if tmp_file_name in ret:
-                            print('########')
+                        #     print('########')
                        ret = ret.replace(tmp_file_name,'')
                        ret = re.sub('起息日','',ret)
                        ret = ret.split(':')[-1]
-                        if re.search('[0-9]+',ret):
+                        if re.search(r'\d+',ret):
+                            ret = ret.split(',')[-1].split(':')[-1]
                            if ret[0] == '日':
                                return ret[1:]
                            else:
@@ -540,6 +531,7 @@ class poc:
                else:
                    print('warning:', t)
                return ret
+        return ''
    def value_date(self):
@@ -569,7 +561,7 @@ class poc:
                    #         ret = re.sub('：|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|（|）|发行|变更', '', ret)
                    #         return ret
                    #     ret = ret+'至'+a[0]
-                    ret = re.sub('：|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|（|）|发行|变更', '', ret)
+                    ret = re.sub(r'：|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|（|）|发行|变更', '', ret)
                    if re.findall('[0-9]+',ret):
                        if re.search('自',ret):
                            ret=ret.split('自')[-1]
@@ -581,6 +573,7 @@ class poc:
                    print('warning:', t)
                return ret
+        return ''
 def txt_handle_interface(txt_path):
    from os import path
@@ -593,10 +586,7 @@ if __name__ == '__main__':
    # single instance:
-# =============================================================================
+    folder='D:\\qqFILE\\result\\full'
-#     folder= './full'
-# =============================================================================
-    folder= 'D:\\qqFILE\\result\\full'
    filelist = os.listdir(folder)
    filelist = [file for file in filelist if re.search('.txt', file)]
    # file = filelist[0]
@@ -607,7 +597,6 @@ if __name__ == '__main__':
    for n, file in enumerate(filelist[:]):
        # try:
-        print(n)
        p = poc(folder, file)
        data.append(p.data)
        # except Exception as err:

--- a/30.PdfToJson/接口调用截图.png
+++ b/30.PdfToJson/接口调用截图.png
--- a/30.PdfToJson/部署文档.txt
+++ b/30.PdfToJson/部署文档.txt
+http://poc.seekxun.com/
+数据库在mech.gimind.com下面的pdftojson数据库中
+________过去的，已经不用这个了__________
+test.seekxun.com
+222.73.241.5
+密码taikor
\ No newline at end of file