Commit 1a78ef03 by mahaisong

feat:新增

parents


Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "PythonApplication1", "PythonApplication1\PythonApplication1.pyproj", "{DABC086C-2EBA-4335-A981-588FEFF3445F}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{DABC086C-2EBA-4335-A981-588FEFF3445F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{DABC086C-2EBA-4335-A981-588FEFF3445F}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys, urllib, urllib2, json, ssl
from datetime import datetime,timedelta
class AnalyzeData:
    """Semantic-analysis result record (one entry of an Item's analysis list).

    Every field lives in a name-mangled private attribute and is exposed
    through a property whose setter coerces the assigned value to the field's
    type (``str``, ``int``, ``float`` or ``list``).

    The list setters previously evaluated ``[](value)``, i.e. they tried to
    call a list literal, which raises ``TypeError``; they now use
    ``list(value)``.

    NOTE(review): the class statement has no base class, so under Python 2
    this is an old-style class and plain attribute assignment bypasses the
    property setters -- confirm whether the coercion is expected to run there.
    """

    def __init__(self):
        self.__IssueID = ""  # issue ID

        # How the record was merged into the Issue:
        #   0  crawled by a Crawl configured on the Issue, merged naturally
        #   1  merged via Topic
        #   2  merged via Subject
        #   3  merged because a region is involved
        #   4  merged because a product is involved
        #   5  merged because an organization / unit is involved
        #   6  merged because a person is involved
        #   7  merged because an industry is involved
        #   8  merged because a category is involved
        #   10 set manually
        #   11 other
        self.__MergeMethod = 0
        # user category IDs (multiple strings)
        self.__CategoryIDs = []
        # user department IDs (multiple strings)
        self.__DepartmentIDs = []
        # industry IDs (multiple strings)
        self.__IndustryIDs = []
        # product names (multiple; "brand-series-name-model-type" paths,
        # separated by spaces or , ; -- tokenized for indexing).  "type" is a
        # newly added special field that does not form a hierarchy with the
        # first four.
        self.__Product = []
        # user topic IDs (multiple strings)
        self.__TopicIDs = []
        # organization IDs (multiple strings)
        self.__OrganizationIDs = []
        # user person names (multiple strings, tokenized for indexing)
        self.__People = []
        # region names (multiple; "province-city-district-street/village"
        # paths, separated by spaces or , ; -- tokenized for indexing)
        self.__Region = []
        # extracted keywords (strings)
        self.__Keywords = []
        # attribute-scoring results (multiple AttributeEvaluationResult objects)
        self.__Evaluation = []
        # category / market IDs (multiple strings)
        self.__MarketIDs = []
        # theme / sector IDs (multiple strings)
        self.__BlockIDs = []
        # stock IDs (multiple strings)
        self.__StockIDs = []
        # related event ID (string)
        self.__RelatedEventID = ""
        # repost (duplication) ID (string)
        self.__DuplicationID = ""

        # "already analyzed" flags, booleans stored as ints: True == 1, False == 0
        self.__ProsdMedia = 0
        self.__ProsdSplite = 0
        self.__ProsdTopic = 0
        self.__ProsdCategory = 0
        self.__ProsdDepartment = 0
        self.__ProsdIndustry = 0
        self.__ProsdProduct = 0
        self.__ProsdPeople = 0
        self.__ProsdOrganization = 0
        self.__ProsdRegion = 0
        self.__ProsdSentiment = 0
        self.__ProsdSentiment8 = 0
        self.__ProsdSentiment2 = 0
        self.__ProsdKeyword = 0
        self.__prosdIG = 0

        # sentiment analysis
        # polarity (1 ~ 3): 1 positive, 2 neutral, 3 negative
        self.__Sentiment = 0.0
        # happy
        self.__EmotionHappy = 0.0
        self.__EmotionAccept = 0.0
        self.__EmotionSurprise = 0.0
        self.__EmotionFear = 0.0
        self.__EmotionSad = 0.0
        self.__EmotionHate = 0.0
        self.__EmotionExpect = 0.0
        self.__EmotionAngry = 0.0
        self.__EmotionPositive = 0.0
        self.__EmotionNegative = 0.0
        # IG tree (string)
        self.__IG = ""

    # ---- string / float fields -------------------------------------------

    @property
    def IG(self):
        return self.__IG

    @IG.setter
    def IG(self, value):
        self.__IG = str(value)

    @property
    def EmotionNegative(self):
        return self.__EmotionNegative

    @EmotionNegative.setter
    def EmotionNegative(self, value):
        self.__EmotionNegative = float(value)

    @property
    def EmotionPositive(self):
        return self.__EmotionPositive

    @EmotionPositive.setter
    def EmotionPositive(self, value):
        self.__EmotionPositive = float(value)

    @property
    def EmotionAngry(self):
        return self.__EmotionAngry

    @EmotionAngry.setter
    def EmotionAngry(self, value):
        self.__EmotionAngry = float(value)

    @property
    def EmotionExpect(self):
        return self.__EmotionExpect

    @EmotionExpect.setter
    def EmotionExpect(self, value):
        self.__EmotionExpect = float(value)

    @property
    def EmotionHate(self):
        return self.__EmotionHate

    @EmotionHate.setter
    def EmotionHate(self, value):
        self.__EmotionHate = float(value)

    @property
    def EmotionSad(self):
        return self.__EmotionSad

    @EmotionSad.setter
    def EmotionSad(self, value):
        self.__EmotionSad = float(value)

    @property
    def EmotionFear(self):
        return self.__EmotionFear

    @EmotionFear.setter
    def EmotionFear(self, value):
        self.__EmotionFear = float(value)

    @property
    def EmotionSurprise(self):
        return self.__EmotionSurprise

    @EmotionSurprise.setter
    def EmotionSurprise(self, value):
        self.__EmotionSurprise = float(value)

    @property
    def EmotionAccept(self):
        return self.__EmotionAccept

    @EmotionAccept.setter
    def EmotionAccept(self, value):
        self.__EmotionAccept = float(value)

    @property
    def EmotionHappy(self):
        return self.__EmotionHappy

    @EmotionHappy.setter
    def EmotionHappy(self, value):
        self.__EmotionHappy = float(value)

    @property
    def Sentiment(self):
        return self.__Sentiment

    @Sentiment.setter
    def Sentiment(self, value):
        self.__Sentiment = float(value)

    # ---- "already analyzed" flags (stored as int) ------------------------

    @property
    def prosdIG(self):
        return self.__prosdIG

    @prosdIG.setter
    def prosdIG(self, value):
        self.__prosdIG = int(value)

    @property
    def ProsdKeyword(self):
        return self.__ProsdKeyword

    @ProsdKeyword.setter
    def ProsdKeyword(self, value):
        self.__ProsdKeyword = int(value)

    @property
    def ProsdSentiment2(self):
        return self.__ProsdSentiment2

    @ProsdSentiment2.setter
    def ProsdSentiment2(self, value):
        self.__ProsdSentiment2 = int(value)

    @property
    def ProsdSentiment8(self):
        return self.__ProsdSentiment8

    @ProsdSentiment8.setter
    def ProsdSentiment8(self, value):
        self.__ProsdSentiment8 = int(value)

    @property
    def ProsdSentiment(self):
        return self.__ProsdSentiment

    @ProsdSentiment.setter
    def ProsdSentiment(self, value):
        self.__ProsdSentiment = int(value)

    @property
    def ProsdRegion(self):
        return self.__ProsdRegion

    @ProsdRegion.setter
    def ProsdRegion(self, value):
        self.__ProsdRegion = int(value)

    @property
    def ProsdOrganization(self):
        return self.__ProsdOrganization

    @ProsdOrganization.setter
    def ProsdOrganization(self, value):
        self.__ProsdOrganization = int(value)

    @property
    def ProsdPeople(self):
        return self.__ProsdPeople

    @ProsdPeople.setter
    def ProsdPeople(self, value):
        self.__ProsdPeople = int(value)

    @property
    def ProsdProduct(self):
        return self.__ProsdProduct

    @ProsdProduct.setter
    def ProsdProduct(self, value):
        self.__ProsdProduct = int(value)

    @property
    def ProsdIndustry(self):
        return self.__ProsdIndustry

    @ProsdIndustry.setter
    def ProsdIndustry(self, value):
        self.__ProsdIndustry = int(value)

    @property
    def ProsdDepartment(self):
        return self.__ProsdDepartment

    @ProsdDepartment.setter
    def ProsdDepartment(self, value):
        self.__ProsdDepartment = int(value)

    @property
    def ProsdCategory(self):
        return self.__ProsdCategory

    @ProsdCategory.setter
    def ProsdCategory(self, value):
        self.__ProsdCategory = int(value)

    @property
    def ProsdTopic(self):
        return self.__ProsdTopic

    @ProsdTopic.setter
    def ProsdTopic(self, value):
        self.__ProsdTopic = int(value)

    @property
    def ProsdSplite(self):
        return self.__ProsdSplite

    @ProsdSplite.setter
    def ProsdSplite(self, value):
        self.__ProsdSplite = int(value)

    @property
    def ProsdMedia(self):
        return self.__ProsdMedia

    @ProsdMedia.setter
    def ProsdMedia(self, value):
        self.__ProsdMedia = int(value)

    # ---- IDs of related records -------------------------------------------

    @property
    def DuplicationID(self):
        return self.__DuplicationID

    @DuplicationID.setter
    def DuplicationID(self, value):
        self.__DuplicationID = str(value)

    @property
    def RelatedEventID(self):
        return self.__RelatedEventID

    @RelatedEventID.setter
    def RelatedEventID(self, value):
        self.__RelatedEventID = str(value)

    # ---- list fields (setter stores a new list built from the iterable) ---

    @property
    def StockIDs(self):
        return self.__StockIDs

    @StockIDs.setter
    def StockIDs(self, value):
        self.__StockIDs = list(value)

    @property
    def BlockIDs(self):
        return self.__BlockIDs

    @BlockIDs.setter
    def BlockIDs(self, value):
        self.__BlockIDs = list(value)

    @property
    def MarketIDs(self):
        return self.__MarketIDs

    @MarketIDs.setter
    def MarketIDs(self, value):
        self.__MarketIDs = list(value)

    @property
    def Evaluation(self):
        return self.__Evaluation

    @Evaluation.setter
    def Evaluation(self, value):
        self.__Evaluation = list(value)

    @property
    def Keywords(self):
        return self.__Keywords

    @Keywords.setter
    def Keywords(self, value):
        self.__Keywords = list(value)

    @property
    def Region(self):
        return self.__Region

    @Region.setter
    def Region(self, value):
        self.__Region = list(value)

    @property
    def People(self):
        return self.__People

    @People.setter
    def People(self, value):
        self.__People = list(value)

    @property
    def OrganizationIDs(self):
        return self.__OrganizationIDs

    @OrganizationIDs.setter
    def OrganizationIDs(self, value):
        self.__OrganizationIDs = list(value)

    @property
    def TopicIDs(self):
        return self.__TopicIDs

    @TopicIDs.setter
    def TopicIDs(self, value):
        self.__TopicIDs = list(value)

    @property
    def Product(self):
        return self.__Product

    @Product.setter
    def Product(self, value):
        self.__Product = list(value)

    @property
    def IndustryIDs(self):
        return self.__IndustryIDs

    @IndustryIDs.setter
    def IndustryIDs(self, value):
        self.__IndustryIDs = list(value)

    @property
    def DepartmentIDs(self):
        return self.__DepartmentIDs

    @DepartmentIDs.setter
    def DepartmentIDs(self, value):
        self.__DepartmentIDs = list(value)

    @property
    def CategoryIDs(self):
        return self.__CategoryIDs

    @CategoryIDs.setter
    def CategoryIDs(self, value):
        self.__CategoryIDs = list(value)

    # ---- merge method / issue ---------------------------------------------

    @property
    def MergeMethod(self):
        return self.__MergeMethod

    @MergeMethod.setter
    def MergeMethod(self, value):
        self.__MergeMethod = int(value)

    @property
    def IssueID(self):
        return self.__IssueID

    @IssueID.setter
    def IssueID(self, value):
        self.__IssueID = str(value)
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class AttributeEvaluationResult:
    """Score of one attribute together with the words that matched it.

    The constructor stores its arguments unchanged; the property setters
    coerce to ``str`` / ``float`` / ``list``.  The ``MatchedWords`` setter
    previously evaluated ``[](value)`` (calling a list literal, which raises
    ``TypeError``); it now uses ``list(value)``.

    NOTE(review): the class has no base class, so under Python 2 it is an
    old-style class and assignment bypasses the property setters -- confirm
    whether the coercion is expected to run there.

    Args:
        Attribute: attribute name (string).
        Value: score of the attribute (float).
        MatchedWords: words that hit the attribute (multiple strings).
    """

    def __init__(self, Attribute, Value, MatchedWords):
        self.__Attribute = Attribute        # attribute name
        self.__Value = Value                # score of this attribute
        self.__MatchedWords = MatchedWords  # words that matched the attribute

    @property
    def Attribute(self):
        return self.__Attribute

    @Attribute.setter
    def Attribute(self, value):
        self.__Attribute = str(value)

    @property
    def MatchedWords(self):
        return self.__MatchedWords

    @MatchedWords.setter
    def MatchedWords(self, value):
        # Build a fresh list from any iterable.
        self.__MatchedWords = list(value)

    @property
    def Value(self):
        return self.__Value

    @Value.setter
    def Value(self, value):
        self.__Value = float(value)
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
sys.path.append(r'C:\\Users\\admin\\Documents\\Visual Studio 2015\\Projects\\PythonApplication1\\PythonApplication1')
from Model.AnalyzeData import *
from Model.AttributeEvaluationResult import *
from Model.ItemTracking import *
from Model.ItemCountData import *
from Model.ItemReply import *
from Model.ResearchReport import *
from Model.ItemLabelData import *
from Model.Structured import *
class Item:
    """One crawled content item plus its media, tracking and analysis data.

    Every field lives in a name-mangled private attribute and is exposed
    through a property.  Setters of scalar fields coerce with ``str`` or
    ``int``; setters of list fields store ``list(value)``.

    Fixes relative to the previous version:

    * list setters evaluated ``[](value)`` (calling a list literal), which
      raises ``TypeError``; they now use ``list(value)``.
    * the ``ResearchReport`` setter's parameter shadowed the class of the
      same name, so it called its own argument.
    * the ``Struct``, ``CurrentCount``, ``Tracking`` and ``Tracking_Forward``
      setters passed the assigned object as the single argument of a
      constructor that requires several positional arguments.

    The object-valued setters now store the assigned object unchanged.

    NOTE(review): the class has no base class, so under Python 2 it is an
    old-style class and assignment bypasses the property setters -- confirm
    whether the coercion is expected to run there.
    """

    def __init__(self):
        # semantic-analysis results; holds multiple AnalyzeData objects
        self.__ItemAnalyzeDataList = []
        # time this item was last updated in this system; kept as a string
        # that must be converted before use (or converted to a time type on
        # assignment if the program needs it)
        self.__UpdateTime = ""
        # time this item was stored; used to analyze processing efficiency
        # and to compute crawl quality
        self.__IndexTime = ""
        # Level of content detail:
        #   0 title only (web page)
        #   1 microblog
        #   2 title and summary
        #   3 first-page content
        #   4 content of all pages (original author note: "not sure whether
        #     first page and true full text need to be distinguished --Chol")
        #   5 comment type
        self.__ContentDetailLevel = 0
        # article ID
        self.__ItemID = ""
        # ItemID in the client's system or on the source website
        self.__ClientItemID = ""
        # parent ItemID
        self.__ParentItemID = ""
        # article URL
        self.__Url = ""
        # title
        self.__CleanTitle = ""
        # short title
        self.__ShortTitle = ""
        # body text
        self.__CleanText = ""
        # HTML body (if needed)
        self.__HTMLText = ""
        # summary
        self.__Summary = ""
        # publication time shown on the page; kept as a string that must be
        # converted before use (or converted to a time type on assignment)
        self.__PubDate = ""
        # author name (tokenized for indexing)
        self.__AuthorName = ""
        # author ID
        self.__AuthorID = ""
        # Author.Tag field, used to tell people apart
        self.__AuthorTag = ""
        # displayed source (e.g. the client/device used)
        self.__Source = ""
        # ID of the reposted item, if any
        self.__RetweetID = ""

        # ---- media information ----
        # media ID
        self.__MediaID = ""
        # media name (tokenized for indexing)
        self.__MediaName = ""
        # channel / section
        self.__MediaChannel = ""
        # rank within the media, expressing order or importance (e.g. front
        # page, home page); int
        self.__Rank = 0
        # Media type (website | forum | blog ...):
        #   0   other / unknown
        #   1   newspapers and magazines
        #   2   radio and television stations
        #   3   news: online media (news-oriented)
        #   4   forum
        #   5   blog
        #   6   microblog
        #   7   social network
        #   8   e-commerce / supply-and-demand sites
        #   9   video site
        #   10  encyclopedia / Q&A
        #   11  WeChat
        #   20  web search engine
        #   23  news search engine
        #   24  forum search
        #   25  blog search
        #   26  microblog search
        #   27  social-network search
        #   28  image search
        #   29  video search
        #   30  encyclopedia / Q&A search
        #   31  news project
        #   32  organization website
        #   100 internal data (corporate LAN)
        #   101 industry media; not a real category, used only for statistics
        self.__MediaType = 0
        # Media distribution scope: 0 unknown, 1 national, 2 local, 3 overseas
        self.__MediaRegionType = 0
        # media weight (mainstream media etc.); SByte
        self.__MediaWeight = 0
        # Media nature (party paper | market-oriented):
        #   0 unknown
        #   1 administrative media (party newspaper)
        #   2 market-oriented media (non-extreme)
        #   3 extreme anti-China
        #   4 extreme "red"
        self.__MediaOrganType = 0
        # Writing style: 0 unknown, 1 news, 2 commentary
        self.__MediaStyle = 0
        # political leaning (-3 reactionary, -2 far left, -1 left-leaning,
        # 0 neutral, 1 right-leaning, 2 far right, 3 extreme right); SByte
        self.__MediaTendency = 0
        # industry attributes of the media
        self.__MediaIndustryIDs = ""
        # Media.MediaTag, used to distinguish media collections
        self.__MediaTag = ""
        # proxy zone; > 0 means access has to go through a proxy; SByte
        self.__ProxyZone = 0
        # original media ID (if a match was found in the media library)
        self.__ReproducedMediaID = ""
        # original media name
        self.__ReproducedMediaName = ""
        # URL of the original media
        self.__ReproducedUrl = ""
        # Copyright status of the media (int):
        #   0 unknown, 11 blacklist, 12 no copyright, 13 reposting not
        #   allowed, 14 crawling refused, 20 whitelist, 21 purchased and
        #   sublicensable, 22 self-authored, 23 open for commercial
        #   reposting, 31 purchased but needs separate authorization,
        #   32 user-owned copyright, 33 user assumes copyright liability,
        #   41 non-commercial use
        self.__MediaCopyright = 0

        # ---- item tracking ----
        # tracking settings and info (comments); one ItemTracking object
        self.__Tracking = ""
        # tracking settings and info (forwards); one ItemTracking object
        self.__Tracking_Forward = ""
        # latest view / comment / forward counts; one ItemCountData object
        self.__CurrentCount = ""
        # periodically sampled view / comment / forward counts (stored in
        # ES); multiple ItemCountData objects
        self.__CountHistory = []
        # reply data; multiple ItemReply objects
        self.__Reply = []

        # ---- extended data ----
        # similar-repost ID (ID of a similar article stored earlier; empty
        # for the first article, the first article's ID for all others)
        self.__DuplicationID = ""
        # word-segmentation result of the title
        self.__SpliteTitle = ""
        # word-segmentation result of the text
        self.__SpliteText = ""
        # whether repost analysis has been performed; bool stored as int
        self.__ProsdDuplication = 0
        # generic tag, set from the Tag fields of the entity libraries
        self.__Tag = ""
        # research report; one ResearchReport object, initialised to an
        # empty string
        self.__ResearchReport = ""
        # corpus-labelling data, one record per customer (supports different
        # labelling standards); multiple ItemLabelData objects
        self.__LabelData = []
        # whether the original article has been deleted; bool stored as int
        self.__IsOriginalDeleted = 0
        # whether this is an advertorial published by an EPR company; bool
        # stored as int
        self.__IsEPRPublished = 0

        # ---- structured text information ----
        # structured text information; one Structured object
        self.__Struct = ""

    # ---- structured / extended data ---------------------------------------

    @property
    def Struct(self):
        return self.__Struct

    @Struct.setter
    def Struct(self, value):
        # Stored as given; expected to be a Structured object.
        self.__Struct = value

    @property
    def IsEPRPublished(self):
        return self.__IsEPRPublished

    @IsEPRPublished.setter
    def IsEPRPublished(self, value):
        self.__IsEPRPublished = int(value)

    @property
    def IsOriginalDeleted(self):
        return self.__IsOriginalDeleted

    @IsOriginalDeleted.setter
    def IsOriginalDeleted(self, value):
        self.__IsOriginalDeleted = int(value)

    @property
    def LabelData(self):
        return self.__LabelData

    @LabelData.setter
    def LabelData(self, value):
        self.__LabelData = list(value)

    @property
    def ResearchReport(self):
        return self.__ResearchReport

    @ResearchReport.setter
    def ResearchReport(self, value):
        # Stored as given; expected to be a ResearchReport object.
        self.__ResearchReport = value

    @property
    def Tag(self):
        return self.__Tag

    @Tag.setter
    def Tag(self, value):
        self.__Tag = str(value)

    @property
    def ProsdDuplication(self):
        return self.__ProsdDuplication

    @ProsdDuplication.setter
    def ProsdDuplication(self, value):
        self.__ProsdDuplication = int(value)

    @property
    def SpliteText(self):
        return self.__SpliteText

    @SpliteText.setter
    def SpliteText(self, value):
        self.__SpliteText = str(value)

    @property
    def SpliteTitle(self):
        return self.__SpliteTitle

    @SpliteTitle.setter
    def SpliteTitle(self, value):
        self.__SpliteTitle = str(value)

    @property
    def DuplicationID(self):
        return self.__DuplicationID

    @DuplicationID.setter
    def DuplicationID(self, value):
        self.__DuplicationID = str(value)

    # ---- tracking -----------------------------------------------------------

    @property
    def Reply(self):
        return self.__Reply

    @Reply.setter
    def Reply(self, value):
        self.__Reply = list(value)

    @property
    def CountHistory(self):
        return self.__CountHistory

    @CountHistory.setter
    def CountHistory(self, value):
        self.__CountHistory = list(value)

    @property
    def CurrentCount(self):
        return self.__CurrentCount

    @CurrentCount.setter
    def CurrentCount(self, value):
        # Stored as given; expected to be an ItemCountData object.
        self.__CurrentCount = value

    @property
    def Tracking_Forward(self):
        return self.__Tracking_Forward

    @Tracking_Forward.setter
    def Tracking_Forward(self, value):
        # Stored as given; expected to be an ItemTracking object.
        self.__Tracking_Forward = value

    @property
    def Tracking(self):
        return self.__Tracking

    @Tracking.setter
    def Tracking(self, value):
        # Stored as given; expected to be an ItemTracking object.
        self.__Tracking = value

    # ---- media information --------------------------------------------------

    @property
    def MediaCopyright(self):
        return self.__MediaCopyright

    @MediaCopyright.setter
    def MediaCopyright(self, value):
        self.__MediaCopyright = int(value)

    @property
    def ReproducedUrl(self):
        return self.__ReproducedUrl

    @ReproducedUrl.setter
    def ReproducedUrl(self, value):
        self.__ReproducedUrl = str(value)

    @property
    def ReproducedMediaName(self):
        return self.__ReproducedMediaName

    @ReproducedMediaName.setter
    def ReproducedMediaName(self, value):
        self.__ReproducedMediaName = str(value)

    @property
    def ReproducedMediaID(self):
        return self.__ReproducedMediaID

    @ReproducedMediaID.setter
    def ReproducedMediaID(self, value):
        self.__ReproducedMediaID = str(value)

    @property
    def ProxyZone(self):
        return self.__ProxyZone

    @ProxyZone.setter
    def ProxyZone(self, value):
        self.__ProxyZone = int(value)

    @property
    def MediaTag(self):
        return self.__MediaTag

    @MediaTag.setter
    def MediaTag(self, value):
        self.__MediaTag = str(value)

    @property
    def MediaIndustryIDs(self):
        return self.__MediaIndustryIDs

    @MediaIndustryIDs.setter
    def MediaIndustryIDs(self, value):
        self.__MediaIndustryIDs = str(value)

    @property
    def MediaTendency(self):
        return self.__MediaTendency

    @MediaTendency.setter
    def MediaTendency(self, value):
        self.__MediaTendency = int(value)

    @property
    def MediaStyle(self):
        return self.__MediaStyle

    @MediaStyle.setter
    def MediaStyle(self, value):
        self.__MediaStyle = int(value)

    @property
    def MediaOrganType(self):
        return self.__MediaOrganType

    @MediaOrganType.setter
    def MediaOrganType(self, value):
        self.__MediaOrganType = int(value)

    @property
    def MediaWeight(self):
        return self.__MediaWeight

    @MediaWeight.setter
    def MediaWeight(self, value):
        self.__MediaWeight = int(value)

    @property
    def MediaRegionType(self):
        return self.__MediaRegionType

    @MediaRegionType.setter
    def MediaRegionType(self, value):
        self.__MediaRegionType = int(value)

    @property
    def MediaType(self):
        return self.__MediaType

    @MediaType.setter
    def MediaType(self, value):
        self.__MediaType = int(value)

    @property
    def Rank(self):
        return self.__Rank

    @Rank.setter
    def Rank(self, value):
        self.__Rank = int(value)

    @property
    def MediaChannel(self):
        return self.__MediaChannel

    @MediaChannel.setter
    def MediaChannel(self, value):
        self.__MediaChannel = str(value)

    @property
    def MediaName(self):
        return self.__MediaName

    @MediaName.setter
    def MediaName(self, value):
        self.__MediaName = str(value)

    @property
    def MediaID(self):
        return self.__MediaID

    @MediaID.setter
    def MediaID(self, value):
        self.__MediaID = str(value)

    # ---- core article fields ------------------------------------------------

    @property
    def RetweetID(self):
        return self.__RetweetID

    @RetweetID.setter
    def RetweetID(self, value):
        self.__RetweetID = str(value)

    @property
    def Source(self):
        return self.__Source

    @Source.setter
    def Source(self, value):
        self.__Source = str(value)

    @property
    def AuthorTag(self):
        return self.__AuthorTag

    @AuthorTag.setter
    def AuthorTag(self, value):
        self.__AuthorTag = str(value)

    @property
    def AuthorID(self):
        return self.__AuthorID

    @AuthorID.setter
    def AuthorID(self, value):
        self.__AuthorID = str(value)

    @property
    def AuthorName(self):
        return self.__AuthorName

    @AuthorName.setter
    def AuthorName(self, value):
        self.__AuthorName = str(value)

    @property
    def PubDate(self):
        return self.__PubDate

    @PubDate.setter
    def PubDate(self, value):
        self.__PubDate = str(value)

    @property
    def Summary(self):
        return self.__Summary

    @Summary.setter
    def Summary(self, value):
        self.__Summary = str(value)

    @property
    def HTMLText(self):
        return self.__HTMLText

    @HTMLText.setter
    def HTMLText(self, value):
        self.__HTMLText = str(value)

    @property
    def CleanText(self):
        return self.__CleanText

    @CleanText.setter
    def CleanText(self, value):
        self.__CleanText = str(value)

    @property
    def ShortTitle(self):
        return self.__ShortTitle

    @ShortTitle.setter
    def ShortTitle(self, value):
        self.__ShortTitle = str(value)

    @property
    def CleanTitle(self):
        return self.__CleanTitle

    @CleanTitle.setter
    def CleanTitle(self, value):
        self.__CleanTitle = str(value)

    @property
    def Url(self):
        return self.__Url

    @Url.setter
    def Url(self, value):
        self.__Url = str(value)

    @property
    def ParentItemID(self):
        return self.__ParentItemID

    @ParentItemID.setter
    def ParentItemID(self, value):
        self.__ParentItemID = str(value)

    @property
    def ItemAnalyzeDataList(self):
        return self.__ItemAnalyzeDataList

    @ItemAnalyzeDataList.setter
    def ItemAnalyzeDataList(self, value):
        self.__ItemAnalyzeDataList = list(value)

    @property
    def UpdateTime(self):
        return self.__UpdateTime

    @UpdateTime.setter
    def UpdateTime(self, value):
        self.__UpdateTime = str(value)

    @property
    def IndexTime(self):
        return self.__IndexTime

    @IndexTime.setter
    def IndexTime(self, value):
        self.__IndexTime = str(value)

    @property
    def ContentDetailLevel(self):
        return self.__ContentDetailLevel

    @ContentDetailLevel.setter
    def ContentDetailLevel(self, value):
        self.__ContentDetailLevel = int(value)

    @property
    def ItemID(self):
        return self.__ItemID

    @ItemID.setter
    def ItemID(self, value):
        self.__ItemID = str(value)

    @property
    def ClientItemID(self):
        return self.__ClientItemID

    @ClientItemID.setter
    def ClientItemID(self, value):
        self.__ClientItemID = str(value)
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class ItemCountData:
    """View / reply / forward counters of an item sampled at one fetch time.

    The constructor stores its arguments unchanged; the property setters
    coerce the counters with ``int`` and the fetch time with ``str``.
    """

    def __init__(self, ViewCount, ReplyCount, ForwardCount, FetchTime):
        self.__FetchTime = FetchTime  # DateTime
        self.__ForwardCount = ForwardCount
        self.__ReplyCount = ReplyCount
        self.__ViewCount = ViewCount

    @property
    def ViewCount(self):
        return self.__ViewCount

    @ViewCount.setter
    def ViewCount(self, value):
        self.__ViewCount = int(value)

    @property
    def ReplyCount(self):
        return self.__ReplyCount

    @ReplyCount.setter
    def ReplyCount(self, value):
        self.__ReplyCount = int(value)

    @property
    def ForwardCount(self):
        return self.__ForwardCount

    @ForwardCount.setter
    def ForwardCount(self, value):
        self.__ForwardCount = int(value)

    @property
    def FetchTime(self):
        return self.__FetchTime

    @FetchTime.setter
    def FetchTime(self, value):
        self.__FetchTime = str(value)
\ No newline at end of file
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys, urllib, urllib2, json, ssl
from datetime import datetime,timedelta
class ItemLabelData:
    """Labelling record that carries the ID of the issue it belongs to.

    The constructor stores ``IssueID`` unchanged; the property setter
    coerces the assigned value with ``str``.
    """

    def __init__(self, IssueID):
        # issue ID
        self.__IssueID = IssueID

    @property
    def IssueID(self):
        return self.__IssueID

    @IssueID.setter
    def IssueID(self, value):
        self.__IssueID = str(value)
\ No newline at end of file
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class ItemReply:
    """A single reply to an item.

    The constructor stores its arguments unchanged; the property setters
    coerce with ``str`` (``int`` for ``Mark``).

    Args:
        ItemID: the item being replied to.
        CleanText: reply content.
        FetchTime: crawl time (DateTime).
        PubDate: publication time shown on the page (DateTime).
        AuthorName: author name.
        AuthorID: author ID.
        Location: location attribute of the author or the content.
        Source: displayed source (e.g. the client/device used).
        Mark: star mark (multi-level), SByte.
    """

    def __init__(self, ItemID, CleanText, FetchTime, PubDate, AuthorName,
                 AuthorID, Location, Source, Mark):
        # what was replied to, and with what text
        self.__ItemID = ItemID
        self.__CleanText = CleanText
        # timestamps
        self.__FetchTime = FetchTime
        self.__PubDate = PubDate
        # author
        self.__AuthorName = AuthorName
        self.__AuthorID = AuthorID
        # origin and rating
        self.__Location = Location
        self.__Source = Source
        self.__Mark = Mark

    @property
    def ItemID(self):
        return self.__ItemID

    @ItemID.setter
    def ItemID(self, value):
        self.__ItemID = str(value)

    @property
    def CleanText(self):
        return self.__CleanText

    @CleanText.setter
    def CleanText(self, value):
        self.__CleanText = str(value)

    @property
    def FetchTime(self):
        return self.__FetchTime

    @FetchTime.setter
    def FetchTime(self, value):
        self.__FetchTime = str(value)

    @property
    def PubDate(self):
        return self.__PubDate

    @PubDate.setter
    def PubDate(self, value):
        self.__PubDate = str(value)

    @property
    def AuthorName(self):
        return self.__AuthorName

    @AuthorName.setter
    def AuthorName(self, value):
        self.__AuthorName = str(value)

    @property
    def AuthorID(self):
        return self.__AuthorID

    @AuthorID.setter
    def AuthorID(self, value):
        self.__AuthorID = str(value)

    @property
    def Location(self):
        return self.__Location

    @Location.setter
    def Location(self, value):
        self.__Location = str(value)

    @property
    def Source(self):
        return self.__Source

    @Source.setter
    def Source(self, value):
        self.__Source = str(value)

    @property
    def Mark(self):
        return self.__Mark

    @Mark.setter
    def Mark(self, value):
        self.__Mark = int(value)
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys, urllib, urllib2, json, ssl
from datetime import datetime,timedelta
class ItemTracking:
    """Follow-up (tracking) settings and progress for an item's replies.

    The constructor stores its arguments unchanged; the property setters
    coerce with ``int`` for the status, priority, page number and counters,
    and with ``str`` for everything else.

    ``FollowStatus`` is the state of the crawl task:
        0  normal, waiting for NextCrawlTime to run the crawl
        1  stopped (manually)
        2  in progress (can only be reset after LockTimeout)
        3  automatically marked as failed and stopped after exceeding the
           consecutive-error limit
        4  waiting in the queue
    """

    def __init__(self, FollowStatus, FollowNextTime, FollowPriority,
                 LastReplyUrl, LastReplyPageNo, LastReply_SinceID,
                 LastReplyDate, LastReplyAuthorID, LastReplyAuthorName,
                 ReplyCount, FollowCount, FollowErrorCount):
        self.__FollowStatus = FollowStatus
        # next follow time (DateTime)
        self.__FollowNextTime = FollowNextTime
        # priority of the follow task (sbyte)
        self.__FollowPriority = FollowPriority
        # URL of the last tracked comment page (used by PageByPage or
        # PageByPageTimeASCE)
        self.__LastReplyUrl = LastReplyUrl
        # last tracked page number (used by FormatStringTimeASCE)
        self.__LastReplyPageNo = LastReplyPageNo
        # ID of the comment synchronised last
        self.__LastReply_SinceID = LastReply_SinceID
        # date of the last reply (DateTime)
        self.__LastReplyDate = LastReplyDate
        # author ID of the last reply
        self.__LastReplyAuthorID = LastReplyAuthorID
        # author name of the last reply
        self.__LastReplyAuthorName = LastReplyAuthorName
        # number of comments already stored
        self.__ReplyCount = ReplyCount
        # number of follow runs (not counting the first crawl)
        self.__FollowCount = FollowCount
        # number of follow errors
        self.__FollowErrorCount = FollowErrorCount

    @property
    def FollowStatus(self):
        return self.__FollowStatus

    @FollowStatus.setter
    def FollowStatus(self, value):
        self.__FollowStatus = int(value)

    @property
    def FollowNextTime(self):
        return self.__FollowNextTime

    @FollowNextTime.setter
    def FollowNextTime(self, value):
        self.__FollowNextTime = str(value)

    @property
    def FollowPriority(self):
        return self.__FollowPriority

    @FollowPriority.setter
    def FollowPriority(self, value):
        self.__FollowPriority = int(value)

    @property
    def LastReplyUrl(self):
        return self.__LastReplyUrl

    @LastReplyUrl.setter
    def LastReplyUrl(self, value):
        self.__LastReplyUrl = str(value)

    @property
    def LastReplyPageNo(self):
        return self.__LastReplyPageNo

    @LastReplyPageNo.setter
    def LastReplyPageNo(self, value):
        self.__LastReplyPageNo = int(value)

    @property
    def LastReply_SinceID(self):
        return self.__LastReply_SinceID

    @LastReply_SinceID.setter
    def LastReply_SinceID(self, value):
        self.__LastReply_SinceID = str(value)

    @property
    def LastReplyDate(self):
        return self.__LastReplyDate

    @LastReplyDate.setter
    def LastReplyDate(self, value):
        self.__LastReplyDate = str(value)

    @property
    def LastReplyAuthorID(self):
        return self.__LastReplyAuthorID

    @LastReplyAuthorID.setter
    def LastReplyAuthorID(self, value):
        self.__LastReplyAuthorID = str(value)

    @property
    def LastReplyAuthorName(self):
        return self.__LastReplyAuthorName

    @LastReplyAuthorName.setter
    def LastReplyAuthorName(self, value):
        self.__LastReplyAuthorName = str(value)

    @property
    def ReplyCount(self):
        return self.__ReplyCount

    @ReplyCount.setter
    def ReplyCount(self, value):
        self.__ReplyCount = int(value)

    @property
    def FollowCount(self):
        return self.__FollowCount

    @FollowCount.setter
    def FollowCount(self, value):
        self.__FollowCount = int(value)

    @property
    def FollowErrorCount(self):
        return self.__FollowErrorCount

    @FollowErrorCount.setter
    def FollowErrorCount(self, value):
        self.__FollowErrorCount = int(value)
\ No newline at end of file
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys, urllib, urllib2, json, ssl
from datetime import datetime,timedelta
class ResearchReport:
    """Research-report metadata: the report type and the issuing agency.

    The constructor stores its arguments unchanged; the property setters
    coerce the assigned value with ``str``.
    """

    def __init__(self, ReportType, ReportAgency):
        # agency that issued the report
        self.__ReportAgency = ReportAgency
        # type of the report
        self.__ReportType = ReportType

    @property
    def ReportType(self):
        return self.__ReportType

    @ReportType.setter
    def ReportType(self, value):
        self.__ReportType = str(value)

    @property
    def ReportAgency(self):
        return self.__ReportAgency

    @ReportAgency.setter
    def ReportAgency(self, value):
        self.__ReportAgency = str(value)
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class Structured:
    """Structured information extracted from an article's text.

    The constructor stores its arguments unchanged; every property setter
    coerces the assigned value with ``str``.

    Args:
        StructId: structure ID, related to the category.
        Category: structure category.
        Subject: article subject (only one is chosen for now).
        Keypoint: key point of the content.
        MarketImpact: market impact.
    """

    def __init__(self, StructId, Category, Subject, Keypoint, MarketImpact):
        self.__MarketImpact = MarketImpact
        self.__Keypoint = Keypoint
        self.__Subject = Subject
        self.__Category = Category
        self.__StructId = StructId

    @property
    def StructId(self):
        return self.__StructId

    @StructId.setter
    def StructId(self, value):
        self.__StructId = str(value)

    @property
    def Category(self):
        return self.__Category

    @Category.setter
    def Category(self, value):
        self.__Category = str(value)

    @property
    def Subject(self):
        return self.__Subject

    @Subject.setter
    def Subject(self, value):
        self.__Subject = str(value)

    @property
    def Keypoint(self):
        return self.__Keypoint

    @Keypoint.setter
    def Keypoint(self, value):
        self.__Keypoint = str(value)

    @property
    def MarketImpact(self):
        return self.__MarketImpact

    @MarketImpact.setter
    def MarketImpact(self, value):
        self.__MarketImpact = str(value)
\ No newline at end of file
++ "a/4.\346\216\245\345\217\243\351\205\215\345\220\210\345\255\220\345\245\207\345\267\245\344\275\234/PythonApplication1/PythonApplication1/Model/__init__.py"
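# Overview of the basic data structures:
# 1. Each request made by PythonApplication1 returns one batch of 100 records, which is stored in ItemList.
# 2. ItemList therefore holds 100 Item objects.
# 3. Each Item has its basic attribute fields plus one ItemAnalyzeDataList, a list (collection) that holds multiple AnalyzeData entries.
# 4. Each AnalyzeData object has its own basic attribute fields.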
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class Person():
    """Holder of five class-level private string fields, all empty by default."""

    __StructId = ""      # structure ID, related to the category
    __Category = ""      # structure category
    __Subject = ""       # article subject (only one is chosen for now)
    __Keypoint = ""      # key point of the content
    __MarketImpact = ""  # market impact
#!/usr/bin/python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys, time, json
from datetime import datetime,timedelta
sys.path.append(r'C:\\Users\\admin\\Documents\\Visual Studio 2015\\Projects\\PythonApplication1\\PythonApplication1')
from taikoropensdk import TaikorOauthClient
from Model.Item import *
from Model.AnalyzeData import *
from Person import Person
# This logic is ultimately meant to be wrapped in a function that is called
# periodically at a fixed interval.

# 1. Basic configuration.
# NOTE(review): the user id and app secret below are credentials hard-coded
# in source control; consider loading them from configuration or the
# environment instead.
giUserId = "TU20161027161331711608"
giAppSecert = "46af9da375b007aa20e49b0391bff871b607f2d8e068fac0e58ba9f33ecd749f9e3205761e5e5167cf38170c6c94b7730a026d3d33d40b77a80ef6f2fcd3e84a526f8d38de0c82678d5bfdb2407e089ee17715491e746adfbc6d33998e7fafa658537331366c46e11c8c46b91446bc6d38ff59fa0cbe65b8486fafe14a7f5ab1"
giSyncIssueID = "GuoHai"
giSource = "Q38APDrtEfU="
giBaseUrl = "http://180.153.146.60:7978/"
giOnlyHasCopyright = "false"
skiphour = 2  # width, in hours, of the stime..etime query window

# 2. Create the connection (per the original note: pass the permission check
#    and obtain the token).
try:
    client = TaikorOauthClient(giUserId, giAppSecert)
except Exception as err:
    # Same output as the former ``print 1,err`` statement.
    print("1 %s" % (err,))
else:
    # 3. No error: build the request parameters.
    # Reminder of timedelta's positional argument order:
    # timedelta([days[, seconds[, microseconds[, milliseconds[, minutes[, hours[, weeks]]]]]]])
    i = 0        # index of the page being requested
    step = 1000  # number of records read per request
    etime = int(time.time())
    print(etime)
    stime = int(time.mktime((datetime.now() - timedelta(hours=skiphour)).timetuple()))
    print(stime)
    parameters = {'Source': giSource, 'Stime': stime, 'Etime': etime, 'Count': 0, 'Skip': 0}
    if giOnlyHasCopyright.upper() == "TRUE":
        # Was the undefined name ``true``, which raised NameError whenever
        # the copyright filter was switched on.
        parameters.setdefault("HasCopyright", True)
    parameters["Count"] = step

    while True:
        parameters["Skip"] = i * step
        resultJson = client.httpGet("Search/FullContents", parameters)
        if not resultJson:
            # Nothing (or an empty string) came back: stop paging.
            break
        # Parse the batch just received into Item objects.
        ItemList = []
        client.handleItemList(resultJson, ItemList)
        if not ItemList:
            break
        i = i + 1
        testkankan = ItemList[0]  # first item of the batch, kept for inspection
        print(i)
        # --- analysis code starts here ---
        # pseudo-code: for item in ItemList: analyze each item ...
        # --- analysis code ends here ---
\ No newline at end of file
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>dabc086c-2eba-4335-a981-588feff3445f</ProjectGuid>
<ProjectHome>.</ProjectHome>
<StartupFile>PythonApplication1.py</StartupFile>
<SearchPath>
</SearchPath>
<WorkingDirectory>.</WorkingDirectory>
<OutputPath>.</OutputPath>
<Name>PythonApplication1</Name>
<RootNamespace>PythonApplication1</RootNamespace>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
<DebugSymbols>true</DebugSymbols>
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
<DebugSymbols>true</DebugSymbols>
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
</PropertyGroup>
<ItemGroup>
<Compile Include="Model\AnalyzeData.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\AttributeEvaluationResult.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\ItemCountData.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\ItemLabelData.py" />
<Compile Include="Model\ItemReply.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\ItemTracking.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\Structured.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\ResearchReport.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\__init__.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Person.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="PythonApplication1.py" />
<Compile Include="Model\Item.py" />
<Compile Include="taikoropensdk.py" />
<Compile Include="Tool.py">
<SubType>Code</SubType>
</Compile>
</ItemGroup>
<ItemGroup>
<Folder Include="Model\" />
</ItemGroup>
<ItemGroup>
<Content Include="Model\说明.txt" />
</ItemGroup>
<PropertyGroup>
<VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">10.0</VisualStudioVersion>
<PtvsTargetsFile>$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets</PtvsTargetsFile>
</PropertyGroup>
<Import Condition="Exists($(PtvsTargetsFile))" Project="$(PtvsTargetsFile)" />
<Import Condition="!Exists($(PtvsTargetsFile))" Project="$(MSBuildToolsPath)\Microsoft.Common.targets" />
<!-- Uncomment the CoreCompile target to enable the Build command in
Visual Studio and specify your pre- and post-build commands in
the BeforeBuild and AfterBuild targets below. -->
<!--<Target Name="CoreCompile" />-->
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
</Project>
\ No newline at end of file
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
sys.path.append(r'C:\Users\admin\Documents\Visual Studio 2015\Projects\PythonApplication1\PythonApplication1')
from ObjectModel import ObjectModel
class Tool:
    """Helpers for pulling values out of decoded JSON structures."""

    def get_target_value(self, key, dic, tmp_list):
        """Collect the values stored under ``key`` in a nested structure.

        If ``key`` is present in ``dic`` its value is appended and the
        search stops at this level; otherwise every dict, list or tuple
        found among the values of ``dic`` is searched recursively.

        :param key: key to look for
        :param dic: decoded JSON object (a dict)
        :param tmp_list: list that receives the matches; mutated in place
        :return: ``tmp_list``, or an error-message string when ``dic`` is
            not a dict or ``tmp_list`` is not a list
        """
        if not isinstance(dic, dict) or not isinstance(tmp_list, list):
            # Kept for backward compatibility: bad arguments are reported
            # through this string rather than through an exception.
            return 'argv[1] not an dict or argv[-1] not an list '
        if key in dic:  # direct membership test; no intermediate key list
            tmp_list.append(dic[key])
        else:
            # The values need exactly the dict/sequence dispatch that
            # _get_value performs, so reuse it instead of repeating it.
            self._get_value(key, dic.values(), tmp_list)
        return tmp_list

    # Leading underscore: private helper, not meant for outside callers.
    def _get_value(self, key, val, tmp_list):
        """Search each element of the iterable ``val`` for ``key``.

        Dict elements are handed to ``get_target_value``; list and tuple
        elements are walked recursively; anything else is ignored.
        """
        for element in val:
            if isinstance(element, dict):
                self.get_target_value(key, element, tmp_list)
            elif isinstance(element, (list, tuple)):
                self._get_value(key, element, tmp_list)
\ No newline at end of file
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
from __future__ import unicode_literals
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
sys.path.append(r'C:\\Users\\admin\\Documents\\Visual Studio 2015\\Projects\\PythonApplication1\\PythonApplication1')
from Tool import Tool
from Model.Item import *
from Model.AnalyzeData import *
from Model.AttributeEvaluationResult import *
from Model.ItemTracking import *
from Model.ItemCountData import *
from Model.ItemReply import *
from Model.ResearchReport import *
from Model.ItemLabelData import *
from Model.Structured import *
# Work around untrusted SSL certificates when calling HTTPS endpoints.
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, so it affects every caller in the process that relies on
# that default, not just this client - confirm this is acceptable.
ssl._create_default_https_context = ssl._create_unverified_context
class TaikorOauthClient:
    """Client for the Taikor open API: token handling, requests, item parsing.

    The constructor requests an access token straight away. ``httpGet`` and
    ``httpPost`` refresh the token when it is missing or about to expire,
    and ``handleItemList`` converts a response payload into ``Item`` objects.
    """

    # JSON keys copied one-to-one onto an AnalyzeData object.
    _ANALYZE_FIELDS = (
        "IssueID", "MergeMethod", "CategoryIDs", "DepartmentIDs",
        "IndustryIDs", "Product", "TopicIDs", "OrganizationIDs", "People",
        "Region", "Keywords", "MarketIDs", "BlockIDs", "StockIDs",
        "RelatedEventID", "DuplicationID", "ProsdMedia", "ProsdSplite",
        "ProsdTopic", "ProsdCategory", "ProsdDepartment", "ProsdIndustry",
        "ProsdProduct", "ProsdPeople", "ProsdOrganization", "ProsdRegion",
        "ProsdSentiment", "ProsdSentiment8", "ProsdSentiment2",
        "ProsdKeyword", "prosdIG", "Sentiment", "EmotionHappy",
        "EmotionAccept", "EmotionSurprise", "EmotionFear", "EmotionSad",
        "EmotionHate", "EmotionExpect", "EmotionAngry", "EmotionPositive",
        "EmotionNegative", "IG",
    )
    # JSON keys copied one-to-one onto an Item object.
    _ITEM_FIELDS = (
        "UpdateTime", "IndexTime", "ContentDetailLevel", "ItemID",
        "ClientItemID", "ParentItemID", "Url", "CleanTitle", "ShortTitle",
        "CleanText", "Summary", "PubDate", "AuthorName", "AuthorID",
        "AuthorTag", "Source", "RetweetID", "MediaID", "MediaName",
        "MediaChannel", "Rank", "MediaType", "MediaRegionType",
        "MediaWeight", "MediaOrganType", "MediaStyle", "MediaTendency",
        "MediaIndustryIDs", "MediaTag", "ProxyZone", "ReproducedMediaID",
        "ReproducedMediaName", "ReproducedUrl", "MediaCopyright",
        "DuplicationID", "SpliteTitle", "SpliteText", "ProsdDuplication",
        "Tag", "IsOriginalDeleted", "IsEPRPublished",
    )
    # Constructor arguments, in order, for the nested model objects.
    _EVALUATION_FIELDS = ("Attribute", "Value", "MatchedWords")
    # NOTE(review): the original passed the single key "ReplyCount,FollowCount",
    # which can never exist in the response and raised KeyError for every
    # non-empty Tracking value. This presumes ItemTracking takes ReplyCount
    # and FollowCount as two separate arguments - confirm against
    # Model/ItemTracking.py.
    _TRACKING_FIELDS = (
        "FollowStatus", "FollowNextTime", "FollowPriority", "LastReplyUrl",
        "LastReplyPageNo", "LastReply_SinceID", "LastReplyDate",
        "LastReplyAuthorID", "LastReplyAuthorName", "ReplyCount",
        "FollowCount", "FollowErrorCount",
    )
    _COUNT_FIELDS = ("ViewCount", "ReplyCount", "ForwardCount", "FetchTime")
    _REPLY_FIELDS = (
        "ItemID", "CleanText", "FetchTime", "PubDate", "AuthorName",
        "AuthorID", "Location", "Source", "Mark",
    )
    _REPORT_FIELDS = ("ReportType", "ReportAgency")
    _LABEL_FIELDS = ("IssueID",)
    _STRUCT_FIELDS = ("StructId", "Category", "Subject", "Keypoint", "MarketImpact")

    def __init__(self, userId, appSecert):
        """Store the credentials and request the first access token.

        :param userId: API user id
        :param appSecert: API application secret
        :raises TypeError: if the token request does not succeed
        """
        self.userId = userId
        self.appSecert = appSecert
        self.baseApiUrl = "https://api.palaspom.com/"
        self.isAccessTokenSet = False
        self.tokenExpiresTime = datetime.min
        self.tokenType = ""
        self.accessToken = ""
        self.tool = Tool()
        if not self.RequestToken():
            raise TypeError("Init the client error, Please check UserId and AppSecert setting. Or you can retry it.")

    def IsAuthorized(self):
        """Return True while a token is set and valid for 10 more minutes."""
        return bool(
            self.isAccessTokenSet
            and self.accessToken
            and self.tokenType
            and self.tokenExpiresTime > (datetime.now() + timedelta(minutes=10))
        )

    def httpGet(self, api, parameters, needAuthorized=True):
        """Issue a GET request and return the decoded JSON response.

        :param api: absolute URL, or a path relative to ``baseApiUrl``
        :param parameters: dict of query parameters (may be empty or None)
        :param needAuthorized: send the Authorization header and refresh
            the token when necessary
        :return: decoded JSON, or None for an empty body or for an HTTP
            error that is not retried
        :raises TypeError: if authorization is required but cannot be obtained
        """
        self._ensure_authorized(needAuthorized)
        if not api.startswith('http'):
            api = self.baseApiUrl + api
        if parameters:
            separator = "&" if "?" in api else "?"
            # urlencode escapes reserved characters in the values; the
            # previous manual concatenation sent them raw. httpPost already
            # encodes its parameters the same way.
            api = api + separator + urllib.urlencode(parameters)
        req = urllib2.Request(api)
        req.add_header('Content-Type', 'application/json')
        return self._send(req, needAuthorized)

    def httpPost(self, api, parameters, needAuthorized=True):
        """Issue a POST request and return the decoded JSON response.

        Parameters, return value and errors are as for ``httpGet``; the
        parameters are sent url-encoded in the request body.
        """
        self._ensure_authorized(needAuthorized)
        if not api.startswith('http'):
            api = self.baseApiUrl + api
        params = urllib.urlencode(parameters)
        req = urllib2.Request(api, params)
        # NOTE(review): the body is form-encoded but is labelled as JSON.
        # Left unchanged because the server's expectation is not visible
        # from here - confirm which of the two is intended.
        req.add_header('Content-Type', 'application/json')
        return self._send(req, needAuthorized)

    def RequestToken(self):
        """Request a new access token and store it on the client.

        :return: True if a token was obtained, False otherwise
        :raises TypeError: if the user id or the app secret is missing
        """
        # ``or``: either value missing makes the request pointless (the
        # original ``and`` only fired when both were missing).
        if not self.userId or not self.appSecert:
            raise TypeError("Please set the UserId and AppSecert before you request. You can get this from https://daas.palaspom.com/Login/Authority.")
        paramsData = {'userId': self.userId, 'appSecert': self.appSecert}
        self.isAccessTokenSet = False
        result = self.httpGet("Oauth2/Authorize", paramsData, False)
        # Debug output of the token response.
        # NOTE(review): this writes the access token to stdout; consider
        # removing it outside of debugging.
        print(result)
        if not result or result['IsError'] or result['IsHttpError'] or not result['AccessToken']:
            return False
        self.tokenExpiresTime = datetime.now() + timedelta(seconds=int(result['ExpiresIn']))
        self.accessToken = result['AccessToken']
        self.tokenType = result['TokenType']
        self.isAccessTokenSet = True
        return True

    def handleItemList(self, returnJson, ItemList):
        """Parse a response payload into ``Item`` objects.

        :param returnJson: JSON text holding an array of item records, or
            the already decoded list
        :param ItemList: list that receives the parsed items; mutated in
            place (a page may hold fewer items than requested)
        :return: ``ItemList``, or None if parsing failed; the error is
            printed and items parsed before the failure stay in ``ItemList``
        """
        try:
            # The payload used to be rewritten with str.replace and passed
            # to eval(), which executes whatever the server sends and
            # corrupts text containing ":null", ":true" or ":false".
            if isinstance(returnJson, (list, tuple)):
                records = returnJson
            else:
                records = json.loads(returnJson)
            records = self._normalize_json(records)
            num = 0
            for num in range(len(records)):
                record = records[num]
                if not record:
                    break
                ItemList.append(self._build_item(record))
            else:
                # The loop ran to completion without hitting an empty record.
                print("%s %s" % (num, '解析完成1次'))
            return ItemList
        except Exception as err:
            # Best-effort parsing: report the problem and let the caller
            # carry on with whatever was collected.
            print(err)
            return None

    # ----- private helpers -----

    def _ensure_authorized(self, needAuthorized):
        """Raise TypeError if authorization is needed but unobtainable."""
        if needAuthorized and not self.IsAuthorized() and not self.RequestToken():
            raise TypeError("Error, Has refused to authorize this request.")

    def _send(self, req, needAuthorized):
        """Send ``req``; return the decoded JSON body, or None.

        A 401/403 on an authorized request triggers one token refresh and
        one retry. Any other HTTP error yields None, as it did before.
        """
        if needAuthorized:
            req.add_header('Authorization', self.tokenType + ' ' + self.accessToken)
        try:
            content = self._read(req)
        except urllib2.HTTPError as e:
            # Only authorized calls may refresh the token. RequestToken()
            # itself goes through httpGet(), so refreshing on behalf of an
            # unauthorized call would recurse without end when the token
            # endpoint answers 401/403.
            if not needAuthorized or e.code not in (401, 403):
                return None
            self.RequestToken()
            req.add_header('Authorization', self.tokenType + ' ' + self.accessToken)
            content = self._read(req)
        if content:
            return json.loads(content)
        return None

    @staticmethod
    def _read(req):
        """Open ``req``, return the raw body and always close the response."""
        resp = urllib2.urlopen(req)
        try:
            return resp.read()
        finally:
            resp.close()

    @staticmethod
    def _normalize_json(value):
        """Map JSON null to '' and booleans to 1/0, recursively.

        The earlier eval-based parser rewrote null/true/false to ''/1/0.
        Keeping that mapping gives the rest of the code the same values
        and keeps ``for x in record[...]`` safe when a list field is null.
        """
        if value is None:
            return ""
        if isinstance(value, bool):
            return int(value)
        if isinstance(value, dict):
            return dict((k, TaikorOauthClient._normalize_json(v)) for k, v in value.items())
        if isinstance(value, list):
            return [TaikorOauthClient._normalize_json(v) for v in value]
        return value

    @staticmethod
    def _make(factory, raw, fields):
        """Call ``factory`` with ``raw[name]`` for every name in ``fields``."""
        return factory(*[raw[name] for name in fields])

    def _make_optional(self, factory, raw, fields):
        """Like ``_make``, but return '' when ``raw`` is empty."""
        if not raw:
            return ""
        return self._make(factory, raw, fields)

    def _build_analyze_data(self, raw):
        """Build one AnalyzeData object from its JSON record."""
        analyze = AnalyzeData()
        for name in self._ANALYZE_FIELDS:
            setattr(analyze, name, raw[name])
        for raw_eval in raw["Evaluation"]:
            analyze.Evaluation.append(
                self._make(AttributeEvaluationResult, raw_eval, self._EVALUATION_FIELDS))
        return analyze

    def _build_item(self, record):
        """Build one Item, including its nested objects, from a JSON record."""
        item = Item()
        # Step 1: the per-issue analysis results.
        for raw_analyze in record["AnalyzeData"]:
            item.ItemAnalyzeDataList.append(self._build_analyze_data(raw_analyze))
        # Step 2: the plain fields.
        for name in self._ITEM_FIELDS:
            setattr(item, name, record[name])
        # Step 3: nested single objects; '' stands for "absent".
        item.Tracking = self._make_optional(
            ItemTracking, record["Tracking"], self._TRACKING_FIELDS)
        item.Tracking_Forward = self._make_optional(
            ItemTracking, record["Tracking_Forward"], self._TRACKING_FIELDS)
        item.CurrentCount = self._make_optional(
            ItemCountData, record["CurrentCount"], self._COUNT_FIELDS)
        item.ResearchReport = self._make_optional(
            ResearchReport, record["ResearchReport"], self._REPORT_FIELDS)
        item.Struct = self._make_optional(
            Structured, record["Struct"], self._STRUCT_FIELDS)
        # Step 4: nested object lists.
        for raw_count in record["CountHistory"]:
            item.CountHistory.append(self._make(ItemCountData, raw_count, self._COUNT_FIELDS))
        for raw_reply in record["Reply"]:
            item.Reply.append(self._make(ItemReply, raw_reply, self._REPLY_FIELDS))
        for raw_label in record["LabelData"]:
            item.LabelData.append(self._make(ItemLabelData, raw_label, self._LABEL_FIELDS))
        return item
\ No newline at end of file
[
[
{
"UpdateTime": "0001-01-01T00:00:00",
"IndexTime": "0001-01-01T00:00:00",
"ContentDetailLevel": "内容详细程度",
"ItemID": "文章ID",
"ClientItemID": "客户系统或对方网站的ItemID",
"ParentItemID": "父级ItemID",
"Url": "文章Url",
"CleanTitle": "标题",
"ShortTitle": "短标题",
"CleanText": "正文",
"Summary": "高达81%的受访企业表示在未来三年有明确增员计划,该比例较上一年大幅上涨24个百分点,其中预计人员增长10%以上企业比例超过1/3\r\n扩张中的企业出于战略布局的考虑,往往会在不同城市设立分部甚至变更总部所在地",
"PubDate": "页面显示发布时间",
"AuthorName": "作者名(分词索引)",
"AuthorID": "作者ID",
"AuthorTag": "Author.Tag字段,用于区分人",
"Source": "显示来源(比如使用终端等)",
"RetweetID": "转载的ID,如果有的话",
"MediaID": "媒体编号ID",
"MediaName": "21世纪经济报道",
"MediaChannel": "政经",
"Rank": " 在媒体中的排序,用以表示顺序或者重要性(例如头版、首页之类)",
"MediaType":"媒体类型",
"MediaRegionType":"媒体区域类型",
"MediaWeight": " 媒体权重",
"MediaOrganType": "媒体性质",
"MediaStyle": "媒体样式",
"MediaTendency": "媒体倾向性",
"MediaIndustryIDs": "媒体行业ID",
"MediaTag": "媒体标签",
"ProxyZone": "代理分区 >0表示要翻墙",
"ReproducedMediaID": "页面显示原始媒体名称ID",
"ReproducedMediaName": "页面显示原始媒体名称",
"ReproducedUrl": "页面显示原始媒体的Url",
"MediaCopyright": "该媒体版权;0 未知,11 黑名单,12 无版权,13 不允许转载,14 拒绝抓取,20 白名单,21 采购可转授,22 自编,23 开放可商业转载,31 采购需单独授权,32 用户自有版权,33 用户自行承担版权,41 非商业使用",
"Tracking": "跟踪设置和信息(评论)",
"Tracking_Forward": "跟踪设置和信息(转发)",
"CurrentCount": "最新的点击数、评论数、转发数",
"CountHistory": " 定时获取的点击数、评论数、转发数(存ES)",
"Reply": "回复数据",
"DuplicationID": "相似转载ID(早先入库的相似文章ID,第一篇该字段为空,其他均为第一篇的ID)",
"SpliteTitle": "分词结果",
"SpliteText": " 分词结果",
"ProsdDuplication": "是否进行过转载分析",
"Tag": "通用标签,根据各实体库中的Tag字段设置",
"ResearchReport": "研报",
"AnalyzeData语义分析结果字段": [
{
"IssueID": "议题编号",
"MergeMethod": "归并入Issue的方式",
"CategoryIDs": " 用户分类ID(多个)",
"DepartmentIDs": " 用户部门ID(多个)",
"IndustryIDs": "行业ID(多个)",
"Product": " 产品名称(多个,<品牌-系列-品名-型号-类型>路径,用空格或,;隔开,分词索引)",
"TopicIDs": "用户话题ID(多个)",
"OrganizationIDs": "组织机构ID(多个)",
"People": "用户人物名称(多个,分词索引)",
"Region": "地区名称(多个,<省-市-区县-街道村镇>路径,用空格或,;隔开,分词索引)",
"Keywords": "关键词提取",
"Evaluation": "属性评分的结果",
"MarketIDs": " 品类/市场编号",
"BlockIDs": "主题/板块编号",
"StockIDs": "股票编号",
"RelatedEventID": "相关事件编号",
"DuplicationID": "转载编号",
"以下几个标志":"已分析过的标志",
"ProsdMedia": false,
"ProsdSplite": false,
"ProsdTopic": true,
"ProsdCategory": true,
"ProsdDepartment": false,
"ProsdIndustry": true,
"ProsdProduct": true,
"ProsdPeople": false,
"ProsdOrganization": true,
"ProsdRegion": true,
"ProsdSentiment": true,
"ProsdSentiment8": false,
"ProsdSentiment2": false,
"ProsdKeyword": false,
"prosdIG": false,
"Sentiment": "情感分析:正负情感(1 ~ 3)1:正面;2:中性;3:负面",
"EmotionHappy": "高兴",
"EmotionAccept": 0,
"EmotionSurprise": 0,
"EmotionFear": 0,
"EmotionSad": 0,
"EmotionHate": 0,
"EmotionExpect": 0,
"EmotionAngry": 0,
"EmotionPositive": 0,
"EmotionNegative": 0,
"IG": null
}
],
"LabelData": " 语料标注字段",
"IsOriginalDeleted": "原文是否已被删除",
"IsEPRPublished": "是否是epr公司发布的软广",
"Struct": "文本结构化信息"
}
]
This source diff could not be displayed because it is too large. You can view the blob instead.
/// <summary>Semantic analysis result for an item (may differ per issue).</summary>
[Serializable]
public class ItemAnalyzeData
{
    /// <summary>Issue ID.</summary>
    public string IssueID { get; set; }

    /// <summary>How the item was merged into the issue.</summary>
    public Enums.MergeMethod MergeMethod { get; set; }

    /// <summary>User category IDs (multiple).</summary>
    public string[] CategoryIDs { get; set; }

    /// <summary>User department IDs (multiple).</summary>
    public string[] DepartmentIDs { get; set; }

    /// <summary>Industry IDs (multiple).</summary>
    public string[] IndustryIDs { get; set; }

    /// <summary>
    /// Product names (multiple; "brand-series-product-model-type" paths separated by
    /// spaces, commas or semicolons; tokenized for indexing). "type" is a newly added
    /// special field that does not take part in the hierarchy of the first four.
    /// </summary>
    public string[] Product { get; set; }

    /// <summary>User topic IDs (multiple).</summary>
    public string[] TopicIDs { get; set; }

    /// <summary>Organization IDs (multiple).</summary>
    public string[] OrganizationIDs { get; set; }

    /// <summary>User-defined people names (multiple; tokenized for indexing).</summary>
    public string[] People { get; set; }

    /// <summary>
    /// Region names (multiple; "province-city-district/county-street/town" paths
    /// separated by spaces, commas or semicolons; tokenized for indexing).
    /// </summary>
    public string[] Region { get; set; }

    /// <summary>Extracted keywords.</summary>
    public string[] Keywords { get; set; }

    /// <summary>Attribute evaluation results.</summary>
    public AttributeEvaluationResult[] Evaluation { get; set; }

    /// <summary>Category/market IDs.</summary>
    public string[] MarketIDs { get; set; }

    /// <summary>Theme/sector IDs.</summary>
    public string[] BlockIDs { get; set; }

    /// <summary>Stock IDs.</summary>
    public string[] StockIDs { get; set; }

    /// <summary>Related event ID.</summary>
    public string RelatedEventID { get; set; }

    /// <summary>Duplication (repost) ID.</summary>
    public string DuplicationID { get; set; }

    /// <summary>"Already analysed" flag; the flags that follow share this meaning.</summary>
    public bool ProsdMedia { get; set; }
    public bool ProsdSplite { get; set; }
    public bool ProsdTopic { get; set; }
    public bool ProsdCategory { get; set; }
    public bool ProsdDepartment { get; set; }
    public bool ProsdIndustry { get; set; }
    public bool ProsdProduct { get; set; }
    public bool ProsdPeople { get; set; }
    public bool ProsdOrganization { get; set; }
    public bool ProsdRegion { get; set; }
    public bool ProsdSentiment { get; set; }
    public bool ProsdSentiment8 { get; set; }
    public bool ProsdSentiment2 { get; set; }
    public bool ProsdKeyword { get; set; }
    public bool prosdIG { get; set; }

    #region Sentiment analysis

    /// <summary>Sentiment polarity (1-3): 1 positive, 2 neutral, 3 negative.</summary>
    public decimal Sentiment { get; set; }

    /// <summary>Happiness.</summary>
    public decimal EmotionHappy { get; set; }

    /// <summary>EmotionAccept</summary>
    public decimal EmotionAccept { get; set; }

    /// <summary>EmotionSurprise</summary>
    public decimal EmotionSurprise { get; set; }

    /// <summary>EmotionFear</summary>
    public decimal EmotionFear { get; set; }

    /// <summary>EmotionSad</summary>
    public decimal EmotionSad { get; set; }

    /// <summary>EmotionHate</summary>
    public decimal EmotionHate { get; set; }

    /// <summary>EmotionExpect</summary>
    public decimal EmotionExpect { get; set; }

    /// <summary>EmotionAngry</summary>
    public decimal EmotionAngry { get; set; }

    /// <summary>EmotionPositive</summary>
    public decimal EmotionPositive { get; set; }

    /// <summary>EmotionNegative</summary>
    public decimal EmotionNegative { get; set; }

    /// <summary>IG Tree</summary>
    public string IG { get; set; }

    #endregion Sentiment analysis

    /// <summary>Creates an empty instance.</summary>
    public ItemAnalyzeData()
    {
    }

    /// <summary>Creates an instance for the given issue and merge method.</summary>
    public ItemAnalyzeData(string IssueID, Enums.MergeMethod MergeMethod)
        : this()
    {
        this.IssueID = IssueID;
        this.MergeMethod = MergeMethod;
    }
}
/// <summary>Data packet describing an ordinary crawl task.</summary>
[Serializable]
[DataContract]
public class CrawlRecode
{
    /// <summary>Crawler name.</summary>
    [DataMember]
    public string CrawlerName { get; set; }

    #region Fields from Crawl

    /// <summary>Crawl ID.</summary>
    [DataMember]
    public string CrawlID { get; set; }

    /// <summary>ItemID of the most recently stored item.</summary>
    [DataMember]
    public string LastItemID { get; set; }

    /// <summary>Publication time of the most recent item.</summary>
    [DataMember]
    public DateTime? LastItemPubDate { get; set; }

    /// <summary>Regular expression used to match URLs.</summary>
    [DataMember]
    public string PatternFetchUrl { get; set; }

    /// <summary>IssueID the crawl belongs to.</summary>
    [DataMember]
    public string IssueID { get; set; }

    /// <summary>URL to crawl.</summary>
    [DataMember]
    public string Url { get; set; }

    /// <summary>Crawler group, mainly used to tell crawler types apart.</summary>
    [DataMember]
    public string CrawlerGroup { get; set; }

    /// <summary>Analysis flags marking which analyses are required.</summary>
    [DataMember]
    public int AnalyzeFlag { get; set; }

    /// <summary>IssueIDs the crawled data should be joined to.</summary>
    [DataMember]
    public string JoinIssueIDs { get; set; }

    /// <summary>Amount to crawl; 20 pages by default when not set (per the original note).</summary>
    [DataMember]
    public int RequiredCount { get; set; }

    /// <summary>Regions.</summary>
    [DataMember]
    public string RegionIDs { get; set; }

    /// <summary>Categories.</summary>
    [DataMember]
    public string CategoryIDs { get; set; }

    /// <summary>Departments.</summary>
    [DataMember]
    public string DepartmentIDs { get; set; }

    /// <summary>Industries covered by the crawl.</summary>
    [DataMember]
    public string CIndustryIDs { get; set; }

    /// <summary>Products.</summary>
    [DataMember]
    public string ProductIDs { get; set; }

    /// <summary>People.</summary>
    [DataMember]
    public string PeopleIDs { get; set; }

    /// <summary>Organizations.</summary>
    [DataMember]
    public string OrganizationIDs { get; set; }

    /// <summary>Tags; multiple values are separated by ASCII commas.</summary>
    [DataMember]
    public string Tags { get; set; }

    #endregion

    #region Fields from Site

    /// <summary>Site ID.</summary>
    [DataMember]
    public string SiteID { get; set; }

    /// <summary>Browser type.</summary>
    [DataMember]
    public sbyte BrowserType { get; set; }

    /// <summary>Safe interval between requests, in milliseconds.</summary>
    [DataMember]
    public int IntervalMSBtwReqs { get; set; }

    /// <summary>Page encoding.</summary>
    [DataMember]
    public string Encoding { get; set; }

    /// <summary>Number of retries after a failure.</summary>
    [DataMember]
    public sbyte FailRetry { get; set; }

    /// <summary>Extraction mode.</summary>
    [DataMember]
    public sbyte ParseMethod { get; set; }

    /// <summary>Pattern for extracting the list page.</summary>
    [DataMember]
    public string ListPattern { get; set; }

    /// <summary>Pattern for fetching related content.</summary>
    [DataMember]
    public string ListPatternNest { get; set; }

    /// <summary>Mode used for related content.</summary>
    [DataMember]
    public sbyte ListPatternNestMethod { get; set; }

    /// <summary>Multithreading limit.</summary>
    [DataMember]
    public int ParallelLimit { get; set; }

    /// <summary>URL pattern of the next page.</summary>
    [DataMember]
    public string ListPatternNextPageUrl { get; set; }

    /// <summary>Whether URLs are case-sensitive.</summary>
    [DataMember]
    public bool UrlCaseSensitive { get; set; }

    /// <summary>Content detail level.</summary>
    [DataMember]
    public sbyte ContentDetailLevel { get; set; }

    #endregion

    #region Fields from Media

    /// <summary>Media ID.</summary>
    [DataMember]
    public string MediaID { get; set; }

    /// <summary>Media name.</summary>
    [DataMember]
    public string MediaName { get; set; }

    /// <summary>Channel name.</summary>
    [DataMember]
    public string Channel { get; set; }

    /// <summary>Media type.</summary>
    [DataMember]
    public sbyte MediaType { get; set; }

    /// <summary>Media region type.</summary>
    [DataMember]
    public sbyte RegionType { get; set; }

    /// <summary>Media weight.</summary>
    [DataMember]
    public sbyte MediaWeight { get; set; }

    /// <summary>Media organization type.</summary>
    [DataMember]
    public sbyte MediaOrganType { get; set; }

    /// <summary>Media style.</summary>
    [DataMember]
    public sbyte MediaStyle { get; set; }

    /// <summary>Media tendency.</summary>
    [DataMember]
    public sbyte MediaTendency { get; set; }

    /// <summary>Media industry IDs.</summary>
    [DataMember]
    public string IndustryIDs { get; set; }

    /// <summary>Media tag.</summary>
    [DataMember]
    public string MediaTag { get; set; }

    /// <summary>Media copyright.</summary>
    // This was the only property without [DataMember], so the data contract
    // serializer silently dropped it. NOTE(review): confirm the omission was
    // not deliberate before relying on the value on the receiving side.
    [DataMember]
    public int MediaCopyright { get; set; }

    #endregion
}
/// <summary>
/// 全局公用的单Item数据包
/// </summary>
[ElasticsearchType(Name = "items")]
public class Item
{
#region 抓取任务数据
/// <summary>
/// 抓取任务ID(只保留第一个抓取任务)
/// </summary>
/// <remarks>如果另一个Crawl抓取到此Item,会根据后者的设置修改Item属性,但不修改CrawlID</remarks>
public string CrawlID { get; set; }
/// <summary>
/// 完成抓取的爬虫(只保留第一个)
/// </summary>
public string Crawler { get; set; }
/// <summary>
/// 抓取时间(第一次,如更新则修改UpdateTime)
/// </summary>
[BsonDateTimeOptions(Kind = DateTimeKind.Local)]
public DateTime FetchTime { get; set; }
/// <summary>
/// 该条目信息本系统最后更新的时间
/// </summary>
[BsonDateTimeOptions(Kind = DateTimeKind.Local)]
public DateTime UpdateTime { get; set; }
/// <summary>
/// 该条目信息的存储时间,用于分析分析效率与计算抓取质量
/// </summary>
[BsonDateTimeOptions(Kind = DateTimeKind.Local)]
public DateTime IndexTime { get; set; }
///<summary>
/// 内容详细程度
///</summary>
public Enums.ContentDetailLevel ContentDetailLevel { get; set; }
#endregion 抓取任务数据
#region 基础数据
/// <summary>
/// 文章ID
/// </summary>
public string ItemID { get; set; }
/// <summary>
/// 客户系统或对方网站的ItemID
/// </summary>
public string ClientItemID { get; set; }
/// <summary>
/// 父级ItemID
/// </summary>
public string ParentItemID { get; set; }
/// <summary>
/// 文章Url
/// </summary>
public string Url { get; set; }
/// <summary>
/// 标题
/// </summary>
public string CleanTitle { get; set; }
/// <summary>
/// 短标题
/// </summary>
public string ShortTitle { get; set; }
/// <summary>
/// 正文
/// </summary>
public string CleanText { get; set; }
/// <summary>
/// HTML正文(如需要)
/// </summary>
public string HTMLText { get; set; }
/// <summary>
/// 摘要
/// </summary>
public string Summary { get; set; }
/// <summary>
/// 页面显示发布时间
/// </summary>
[BsonDateTimeOptions(Kind = DateTimeKind.Local)]
public DateTime PubDate { get; set; }
/// <summary>
/// PostIP
/// </summary>
public Int32 IP { get; set; }
/// <summary>
/// 作者或内容的地点属性
/// </summary>
public string Location { get; set; }
/// <summary>
/// Location集合的ID,外部的(如点评)设为Url
/// </summary>
public string PoID { get; set; }
/// <summary>
/// LocationID的来源,_前缀表示url未解开,Location集合中没有
/// </summary>
public string PoIDSource { get; set; }
/// <summary>
/// 接近某CBD的PoID
/// </summary>
public string NearCBD_PoID { get; set; }
/// <summary>
/// 纬度
/// </summary>
public float Lat { get; set; }
/// <summary>
/// 经度
/// </summary>
public float Lon { get; set; }
/// <summary>
/// 作者名(分词索引)
/// </summary>
public string AuthorName { get; set; }
/// <summary>
/// 作者ID
/// </summary>
public string AuthorID { get; set; }
/// <summary>
/// 作者图片
/// </summary>
public string AuthorImg { get; set; }
/// <summary>
/// 实名认证状态
/// </summary>
public Enums.CertificatedType AuthorCertificated { get; set; }
/// <summary>
/// Author.Tag字段,用于区分人
/// </summary>
public string AuthorTag { get; set; }
/// <summary>
/// 显示来源(比如使用终端等)
/// </summary>
public string Source { get; set; }
/// <summary>
/// 附带图片
/// </summary>
public string AttachImg { get; set; }
/// <summary>
/// 附带的外部链接
/// </summary>
public string AttachUrl { get; set; }
/// <summary>
/// 附件
/// </summary>
public string AttachFile { get; set; }
#endregion
//转载的ID,如果有的话
public string RetweetID { get; set; }
#region 媒体信息
/// <summary>
/// Media ID
/// </summary>
public string MediaID { get; set; }
/// <summary>
/// Media name (tokenized for indexing)
/// </summary>
public string MediaName { get; set; }
/// <summary>
/// Channel / section
/// </summary>
public string MediaChannel { get; set; }
/// <summary>
/// Ordering within the media outlet, indicating position or importance (e.g. front page, home page)
/// </summary>
public int Rank { get; set; }
/// <summary>
/// Media type (website | forum | blog ...)
/// </summary>
public Enums.MediaType MediaType { get; set; }
/// <summary>
/// Distribution scope of the media (national | local | overseas)
/// </summary>
public Enums.MediaRegionType MediaRegionType { get; set; }
/// <summary>
/// Media weight (e.g. mainstream media)
/// </summary>
public SByte MediaWeight { get; set; }
/// <summary>
/// Nature of the media (party newspaper | market-oriented)
/// </summary>
public Enums.MediaOrganType MediaOrganType { get; set; }
/// <summary>
/// Writing-style type (news | commentary | repost ...)
/// </summary>
public Enums.MediaStyle MediaStyle { get; set; }
/// <summary>
/// Political tendency (-3 reactionary, -2 far left, -1 left-leaning, 0 neutral, 1 right-leaning, 2 far right, 3 extreme right)
/// </summary>
public SByte MediaTendency { get; set; }
/// <summary>
/// Industry attribute(s) of the media
/// </summary>
public string MediaIndustryIDs { get; set; }
/// <summary>
/// Media.MediaTag, used to distinguish media collections
/// </summary>
public string MediaTag { get; set; }
/// <summary>
/// Proxy zone; a value greater than 0 means a proxy is needed to get past the firewall
/// </summary>
public SByte ProxyZone { get; set; }
/// <summary>
/// Original media ID (if a match was found in the media library)
/// </summary>
public string ReproducedMediaID { get; set; }
/// <summary>
/// Original media name
/// </summary>
public string ReproducedMediaName { get; set; }
/// <summary>
/// Url of the original media
/// </summary>
public string ReproducedUrl { get; set; }
/// <summary>
/// Copyright status of this media: 0 unknown, 11 blacklisted, 12 no copyright, 13 reprinting not allowed, 14 crawling refused, 20 whitelisted, 21 purchased and sublicensable, 22 self-authored, 23 open for commercial reprinting, 31 purchased but separate authorization required, 32 user owns the copyright, 33 user bears copyright responsibility, 41 non-commercial use
/// </summary>
public short MediaCopyright { get; set; }
#endregion 媒体信息
#region Item跟踪
/// <summary>
/// Tracking settings and information (comments)
/// </summary>
public ItemTracking Tracking { get; set; }
/// <summary>
/// Tracking settings and information (reposts/forwards)
/// </summary>
public ItemTracking Tracking_Forward { get; set; }
/// <summary>
/// Latest click, comment and repost counts
/// </summary>
public ItemCountData CurrentCount { get; set; }
/// <summary>
/// Periodically sampled click, comment and repost counts (stored in ES)
/// </summary>
public ItemCountData[] CountHistory { get; set; }
/// <summary>
/// Reply data
/// </summary>
public ItemReply[] Reply { get; set; }
#endregion Item跟踪
#region 扩展数据
/// <summary>
/// ID of the similar/reposted article (the ID of the similar article stored earliest; empty for the first article, all others hold the first article's ID)
/// </summary>
public string DuplicationID { get; set; }
/// <summary>
/// Word-segmentation result
/// </summary>
public string SpliteTitle { get; set; }
/// <summary>
/// Word-segmentation result
/// </summary>
public string SpliteText { get; set; }
/// <summary>
/// Whether repost (duplication) analysis has been performed
/// </summary>
public bool ProsdDuplication { get; set; }
/// <summary>
/// General tag, set from the Tag field of each entity library
/// </summary>
public string Tag { get; set; }
/// <summary>
/// Research report
/// </summary>
public ResearchReport ResearchReport { get; set; }
/// <summary>
/// Analysis fields, one record per customer (supports different analysis models)
/// </summary>
[Nested(IncludeInParent = true)]//(Path = "analyzeData")]
public ItemAnalyzeData[] AnalyzeData { get; set; }
/// <summary>
/// Management fields, one record per customer (supports different application models)
/// </summary>
[Nested(IncludeInParent = true)]//(Path = "manageData")]
public ItemManageData[] ManageData { get; set; }
/// <summary>
/// Publishing fields, one record per customer (supports different publishing sites)
/// </summary>
[Nested(IncludeInParent = true)]//(Path = "publishData")]
public ItemPublishData[] PublishData { get; set; }
/// <summary>
/// Corpus annotation fields, one record per customer (supports different annotation standards)
/// </summary>
[Nested(IncludeInParent = true)]//(Path = "labelData")]
public ItemLabelData[] LabelData { get; set; }
/// <summary>
/// Users mentioned with @ (excluding @-mentions that come from the reposted content)
/// </summary>
public string[] ReferredAuthorIDs { get; set; }
/// <summary>
/// Whether the original article has been deleted
/// </summary>
public bool IsOriginalDeleted { get; set; }
/// <summary>
/// Whether this is an advertorial published by the EPR company
/// </summary>
public bool IsEPRPublished { get; set; }
#endregion
#region 文本结构化信息
/// <summary>
/// Structured information extracted from the text
/// </summary>
public Structured Struct { get; set; }
#endregion
#region Debug
/// <summary>
/// Returns a short debug representation containing the item's ItemID and Url.
/// </summary>
public override string ToString()
{
    const string template = "[ItemID:{0} Url:{1}]";
    string rendered = string.Format(template, ItemID, Url);
    return rendered;
}
#endregion
#region 智又盈扩展 废弃
//public ZyyicItem ZyyicItem { get; set; }
#endregion
#region 扩展方法
/// <summary>
/// Returns the title to display for the given issue: the issue's EditorTitle when a
/// management record exists and that title is non-empty, otherwise CleanTitle.
/// </summary>
/// <param name="issueID">Issue whose management record is looked up.</param>
/// <returns>The editor title if present, else CleanTitle.</returns>
public string GetDisplayTitle(string issueID)
{
    ItemManageData match = ManageData == null
        ? null
        : ManageData.FirstOrDefault(entry => entry.IssueID == issueID);
    bool hasEditorTitle = match != null && !string.IsNullOrEmpty(match.EditorTitle);
    return hasEditorTitle ? match.EditorTitle : CleanTitle;
}
/// <summary>
/// Returns the body to display for the given issue: the issue's EditorText when a
/// management record exists and that text is non-empty, otherwise HTMLText.
/// </summary>
/// <param name="issueID">Issue whose management record is looked up.</param>
/// <returns>The editor text if present, else HTMLText.</returns>
public string GetDisplayText(string issueID)
{
    ItemManageData match = ManageData == null
        ? null
        : ManageData.FirstOrDefault(entry => entry.IssueID == issueID);
    bool hasEditorText = match != null && !string.IsNullOrEmpty(match.EditorText);
    return hasEditorText ? match.EditorText : HTMLText;
}
/// <summary>
/// Returns the ZyyicItemData.Summary of the first management record that belongs to the
/// given issue and has ZyyicItemData set; returns an empty string when there is none.
/// </summary>
/// <param name="issueID">Issue whose management record is looked up.</param>
/// <returns>The stored summary, or "" when no matching record exists.</returns>
public string GetDisplaySummary(string issueID)
{
    if (ManageData != null)
    {
        foreach (var entry in ManageData)
        {
            // First record matching both conditions wins, as with FirstOrDefault.
            if (entry.IssueID == issueID && entry.ZyyicItemData != null)
                return entry.ZyyicItemData.Summary;
        }
    }
    return "";
}
/// <summary>
/// Returns the first management record whose IssueID equals the given issue.
/// </summary>
/// <param name="issueID">Issue to look up.</param>
/// <returns>The matching record, or null when ManageData is null or has no match.</returns>
public ItemManageData GetItemManageData(string issueID)
{
    return ManageData == null
        ? null
        : ManageData.FirstOrDefault(entry => entry.IssueID == issueID);
}
/// <summary>
/// Returns the first analysis record whose IssueID equals the given issue.
/// </summary>
/// <param name="issueID">Issue to look up.</param>
/// <returns>The matching record, or null when AnalyzeData is null or has no match.</returns>
public ItemAnalyzeData GetItemAnalyzeData(string issueID)
{
    return AnalyzeData == null
        ? null
        : AnalyzeData.FirstOrDefault(entry => entry.IssueID == issueID);
}
/// <summary>
/// Stores an editor-supplied title on the management record of the given issue,
/// creating the record (and the ManageData array) when it does not exist yet.
/// </summary>
/// <param name="issueID">Issue whose management record receives the title.</param>
/// <param name="title">Editor title to store in EditorTitle.</param>
public void SetDisplayTitle(string issueID, string title)
{
    if (ManageData == null)
        ManageData = new ItemManageData[] { new ItemManageData(issueID) };
    var corrData = ManageData.FirstOrDefault(data => data.IssueID == issueID);
    if (corrData == null)
    {
        // No record for this issue yet: append a new one carrying the title.
        var manageList = ManageData.ToList();
        manageList.Add(new ItemManageData(issueID) { EditorTitle = title });
        ManageData = manageList.ToArray();
    }
    else
    {
        corrData.EditorTitle = title;
    }
    // Back-fill the base title only when the item has none, the same way SetDisplayText
    // back-fills HTMLText. The previous check tested `title` itself, which overwrote
    // CleanTitle with null/"" whenever an empty editor title was set and never filled
    // in a missing CleanTitle.
    if (string.IsNullOrEmpty(CleanTitle))
        CleanTitle = title;
}
/// <summary>
/// Stores a summary in ZyyicItemData.Summary on the management record of the given issue,
/// creating the record and/or its ZyyicItemData when they do not exist yet.
/// </summary>
/// <param name="issueID">Issue whose management record receives the summary.</param>
/// <param name="summary">Summary text to store.</param>
public void SetDisplaySummary(string issueID, string summary)
{
    if (ManageData == null)
        ManageData = new ItemManageData[] { new ItemManageData(issueID) };
    ItemManageData target = ManageData.FirstOrDefault(entry => entry.IssueID == issueID);
    if (target == null)
    {
        // No record for this issue yet: append one that already carries the summary.
        var added = new ItemManageData(issueID) { ZyyicItemData = new ZyyicItemData() { Summary = summary } };
        ManageData = ManageData.Concat(new[] { added }).ToArray();
        return;
    }
    if (target.ZyyicItemData == null)
        target.ZyyicItemData = new ZyyicItemData() { Summary = summary };
    else
        target.ZyyicItemData.Summary = summary;
}
/// <summary>
/// Stores an editor-supplied body on the management record of the given issue, creating
/// the record when it does not exist yet. When HTMLText is null or empty it is set to
/// the same text.
/// </summary>
/// <param name="issueID">Issue whose management record receives the text.</param>
/// <param name="text">Editor text to store in EditorText.</param>
public void SetDisplayText(string issueID, string text)
{
    if (ManageData == null)
        ManageData = new ItemManageData[] { new ItemManageData(issueID) };
    ItemManageData target = ManageData.FirstOrDefault(entry => entry.IssueID == issueID);
    if (target != null)
    {
        target.EditorText = text;
    }
    else
    {
        // No record for this issue yet: append one carrying the text.
        var added = new ItemManageData(issueID) { EditorText = text };
        ManageData = ManageData.Concat(new[] { added }).ToArray();
    }
    if (string.IsNullOrEmpty(HTMLText))
        HTMLText = text;
}
/// <summary>
/// Sets the deleted flag on the management record of the given issue, creating the
/// record when it does not exist yet.
/// </summary>
/// <param name="issueID">Issue whose management record is updated.</param>
/// <param name="IsDeleted">Value to store in the record's IsDeleted flag.</param>
public void SetItemDeleted(string issueID, bool IsDeleted)
{
    if (ManageData == null)
        ManageData = new ItemManageData[] { new ItemManageData(issueID) };
    ItemManageData target = ManageData.FirstOrDefault(entry => entry.IssueID == issueID);
    if (target != null)
    {
        target.IsDeleted = IsDeleted;
        return;
    }
    // No record for this issue yet: append one carrying the flag.
    var added = new ItemManageData(issueID) { IsDeleted = IsDeleted };
    ManageData = ManageData.Concat(new[] { added }).ToArray();
}
#endregion
/// <summary>
/// Sets the review status on the management record of the given issue, creating the
/// record when it does not exist yet.
/// </summary>
/// <param name="issueID">Issue whose management record is updated.</param>
/// <param name="status">Status to store; it is cast to sbyte and written to HandleStatus.</param>
public void SetItemStatus(string issueID,Enums.ZyyicEnums.Status status)
{
    if (ManageData == null)
        ManageData = new ItemManageData[] { new ItemManageData(issueID) };
    ItemManageData target = ManageData.FirstOrDefault(entry => entry.IssueID == issueID);
    if (target != null)
    {
        target.HandleStatus = (sbyte)status;
        return;
    }
    // No record for this issue yet: append one carrying the status.
    var added = new ItemManageData(issueID) { HandleStatus = (sbyte)status };
    ManageData = ManageData.Concat(new[] { added }).ToArray();
}
}
/// <summary>
/// Raw crawled data (passed from the Crawler to the back end)
/// </summary>
[Serializable]
public class RawItem
{
#region 抓取任务数据
/// <summary>
/// Crawl task ID (only the first crawl task is kept)
/// </summary>
/// <remarks>If another Crawl fetches this Item, the Item's properties are modified according to the latter's settings, but CrawlID is not changed</remarks>
public string CrawlID { get; set; }
/// <summary>
/// Crawler that performed the fetch (only the first one is kept)
/// </summary>
public string Crawler { get; set; }
/// <summary>
/// Fetch time (first fetch; on update, UpdateTime is modified instead)
/// </summary>
[BsonDateTimeOptions(Kind = DateTimeKind.Local)]
public DateTime FetchTime { get; set; }
/// <summary>
/// Level of content detail
/// </summary>
public Enums.ContentDetailLevel ContentDetailLevel { get; set; }
#endregion 抓取任务数据
#region 基础数据
/// <summary>
/// Article ID
/// </summary>
public string ItemID { get; set; }
/// <summary>
/// ItemID in the customer's system or on the source website
/// </summary>
public string ClientItemID { get; set; }
/// <summary>
/// Parent ItemID
/// </summary>
public string ParentItemID { get; set; }
/// <summary>
/// Article Url
/// </summary>
public string Url { get; set; }
/// <summary>
/// Title
/// </summary>
public string CleanTitle { get; set; }
/// <summary>
/// Body text
/// </summary>
public string CleanText { get; set; }
/// <summary>
/// HTML body (if needed)
/// </summary>
public string HTMLText { get; set; }
/// <summary>
/// Publication time shown on the page
/// </summary>
[BsonDateTimeOptions(Kind = DateTimeKind.Local)]
public DateTime PubDate { get; set; }
/// <summary>
/// IP
/// </summary>
public Int32 IP { get; set; }
/// <summary>
/// Location attribute of the author or content
/// </summary>
public string Location { get; set; }
/// <summary>
/// ID in the Location collection; for external sources (e.g. Dianping) it is set to the Url
/// </summary>
public string PoID { get; set; }
/// <summary>
/// Source of the LocationID; a "_" prefix means the url has not been resolved and is not in the Location collection
/// </summary>
public string PoIDSource { get; set; }
/// <summary>
/// Latitude
/// </summary>
public float Lat { get; set; }
/// <summary>
/// Longitude
/// </summary>
public float Lon { get; set; }
/// <summary>
/// Author name (tokenized for indexing)
/// </summary>
public string AuthorName { get; set; }
/// <summary>
/// Author ID
/// </summary>
public string AuthorID { get; set; }
/// <summary>
/// Author image
/// </summary>
public string AuthorImg { get; set; }
/// <summary>
/// Real-name verification status
/// </summary>
public Enums.CertificatedType AuthorCertificated { get; set; }
/// <summary>
/// Displayed source (e.g. the client/device used)
/// </summary>
public string Source { get; set; }
/// <summary>
/// Attached image
/// </summary>
public string AttachImg { get; set; }
/// <summary>
/// Attached external link
/// </summary>
public string AttachUrl { get; set; }
/// <summary>
/// Attached file
/// </summary>
public string AttachFile { get; set; }
/// <summary>
/// Ordering within the media outlet, indicating position or importance (e.g. front page, home page)
/// </summary>
public int Rank { get; set; }
#endregion
#region 媒体信息
/// <summary>
/// Original media name as shown on the page
/// </summary>
public string ReproducedMediaName { get; set; }
/// <summary>
/// Url of the original media as shown on the page
/// </summary>
public string ReproducedUrl { get; set; }
#endregion 媒体信息
/// <summary>
/// Creates an empty RawItem; no property is initialized.
/// </summary>
public RawItem()
{
}
/// <summary>
/// Creates a RawItem for the given crawl task and crawler and stamps FetchTime with DateTime.Now.
/// </summary>
/// <param name="CrawlID">Crawl task ID.</param>
/// <param name="Crawler">Crawler that performed the fetch.</param>
/// <param name="DetailLevel">Value stored in ContentDetailLevel.</param>
public RawItem(string CrawlID, string Crawler, Enums.ContentDetailLevel DetailLevel)
{
this.CrawlID = CrawlID;
this.Crawler = Crawler;
this.ContentDetailLevel = DetailLevel;
FetchTime = DateTime.Now;
}
/// <summary>
/// Creates a RawItem for the given crawl task, crawler and Url, stamps FetchTime with
/// DateTime.Now and sets ItemID to MD5Helper.getMd5Hash(Url).
/// </summary>
/// <param name="CrawlID">Crawl task ID.</param>
/// <param name="Crawler">Crawler that performed the fetch.</param>
/// <param name="DetailLevel">Value stored in ContentDetailLevel.</param>
/// <param name="Url">Article Url; also the input from which ItemID is derived.</param>
public RawItem(string CrawlID, string Crawler, Enums.ContentDetailLevel DetailLevel, string Url)
{
this.CrawlID = CrawlID;
this.Crawler = Crawler;
this.ContentDetailLevel = DetailLevel;
this.Url = Url;
// NOTE(review): behaviour of getMd5Hash for a null Url is not visible here - confirm callers never pass null.
this.ItemID = MD5Helper.getMd5Hash(Url);
FetchTime = DateTime.Now;
}
}
/// <summary>
/// Data structure holding the result of scoring one attribute
/// </summary>
public class AttributeEvaluationResult
{
/// <summary>
/// Attribute name
/// </summary>
public string Attribute { get; set; }
/// <summary>
/// Score of this attribute
/// </summary>
public float Value { get; set; }
/// <summary>
/// Words that matched this attribute
/// </summary>
public string[] MatchedWords { get; set; }
/// <summary>
/// Creates a result with the given attribute name, score and matched words.
/// </summary>
/// <param name="Attribute">Attribute name.</param>
/// <param name="Value">Score of the attribute.</param>
/// <param name="MatchedWords">Words that matched the attribute.</param>
public AttributeEvaluationResult(string Attribute, float Value, string[] MatchedWords)
{
this.Attribute = Attribute;
this.Value = Value;
this.MatchedWords = MatchedWords;
}
}
/// <summary>
/// Corpus annotation attributes
/// </summary>
[Serializable]
public class ItemLabelData
{
/// <summary>
/// Issue ID
/// </summary>
public string IssueID { get; set; }
/// <summary>
/// Creates an empty ItemLabelData.
/// </summary>
public ItemLabelData()
{
}
//todo: design the corpus annotation tool
}
[
[
{
"CrawlID": "抓取任务ID",
"Crawler": "爬虫",
"FetchTime": "抓取时间",
"UpdateTime": "0001-01-01T00:00:00",
"IndexTime": "0001-01-01T00:00:00",
"ContentDetailLevel": "内容详细程度",
"ItemID": "文章ID",
"ClientItemID": "客户系统或对方网站的ItemID",
"ParentItemID": "父级ItemID",
"Url": "文章Url",
"CleanTitle": "标题",
"ShortTitle": "短标题",
"CleanText": "正文",
"HTMLText": " HTML正文",
"Summary": "高达81%的受访企业表示在未来三年有明确增员计划,该比例较上一年大幅上涨24个百分点,其中预计人员增长10%以上企业比例超过1/3\r\n扩张中的企业出于战略布局的考虑,往往会在不同城市设立分部甚至变更总部所在地",
"PubDate": "页面显示发布时间",
"IP": 0,
"Location": "作者或内容的地点属性",
"PoID": "Location集合的ID,外部的(如点评)设为Url",
"PoIDSource": " LocationID的来源,_前缀表示url未解开,Location集合中没有",
"NearCBD_PoID": "接近某CBD的PoID",
"Lat": "纬度",
"Lon": "经度",
"AuthorName": "作者名(分词索引)",
"AuthorID": "作者ID",
"AuthorImg": "作者图片",
"AuthorCertificated": "实名认证状态",
"AuthorTag": "Author.Tag字段,用于区分人",
"Source": "显示来源(比如使用终端等)",
"AttachImg": "附带图片",
"AttachUrl": "附带的外部链接",
"AttachFile": "附件",
"RetweetID": "转载的ID,如果有的话",
"MediaID": "媒体编号ID",
"MediaName": "21世纪经济报道",
"MediaChannel": "政经",
"Rank": " 在媒体中的排序,用以表示顺序或者重要性(例如头版、首页之类)",
"MediaType":"媒体类型",
"MediaRegionType":"媒体区域类型",
"MediaWeight": " 媒体权重",
"MediaOrganType": "媒体性质",
"MediaStyle": "媒体样式",
"MediaTendency": "媒体倾向性",
"MediaIndustryIDs": "媒体行业ID",
"MediaTag": "媒体标签",
"ProxyZone": "代理分区 >0表示要翻墙",
"ReproducedMediaID": "页面显示原始媒体名称ID",
"ReproducedMediaName": "页面显示原始媒体名称",
"ReproducedUrl": "页面显示原始媒体的Url",
"MediaCopyright": "该媒体版权;0 未知,11 黑名单,12 无版权,13 不允许转载,14 拒绝抓取,20 白名单,21 采购可转授,22 自编,23 开放可商业转载,31 采购需单独授权,32 用户自有版权,33 用户自行承担版权,41 非商业使用",
"Tracking": "跟踪设置和信息(评论)",
"Tracking_Forward": "跟踪设置和信息(转发)",
"CurrentCount": "最新的点击数、评论数、转发数",
"CountHistory": " 定时获取的点击数、评论数、转发数(存ES)",
"Reply": "回复数据",
"DuplicationID": "相似转载ID(早先入库的相似文章ID,第一篇该字段为空,其他均为第一篇的ID)",
"SpliteTitle": "分词结果",
"SpliteText": " 分词结果",
"ProsdDuplication": "是否进行过转载分析",
"Tag": "通用标签,根据各实体库中的Tag字段设置",
"ResearchReport": "研报",
"AnalyzeData语义分析结果字段": [
{
"IssueID": "议题编号",
"MergeMethod": "归并入Issue的方式",
"CategoryIDs": " 用户分类ID(多个)",
"DepartmentIDs": " 用户部门ID(多个)",
"IndustryIDs": "行业ID(多个)",
"Product": " 产品名称(多个,<品牌-系列-品名-型号-类型>路径,用空格或,;隔开,分词索引)",
"TopicIDs": "用户话题ID(多个)",
"OrganizationIDs": "组织机构ID(多个)",
"People": "用户人物名称(多个,分词索引)",
"Region": "地区名称(多个,<省-市-区县-街道村镇>路径,用空格或,;隔开,分词索引)",
"Keywords": "关键词提取",
"Evaluation": "属性评分的结果",
"MarketIDs": " 品类/市场编号",
"BlockIDs": "主题/板块编号",
"StockIDs": "股票编号",
"RelatedEventID": "相关事件编号",
"DuplicationID": "转载编号",
"以下几个标志":"已分析过的标志",
"ProsdMedia": false,
"ProsdSplite": false,
"ProsdTopic": true,
"ProsdCategory": true,
"ProsdDepartment": false,
"ProsdIndustry": true,
"ProsdProduct": true,
"ProsdPeople": false,
"ProsdOrganization": true,
"ProsdRegion": true,
"ProsdSentiment": true,
"ProsdSentiment8": false,
"ProsdSentiment2": false,
"ProsdKeyword": false,
"prosdIG": false,
"Sentiment": "情感分析:正负情感(1 ~ 3)1:正面;2:中性;3:负面",
"EmotionHappy": "高兴",
"EmotionAccept": 0,
"EmotionSurprise": 0,
"EmotionFear": 0,
"EmotionSad": 0,
"EmotionHate": 0,
"EmotionExpect": 0,
"EmotionAngry": 0,
"EmotionPositive": 0,
"EmotionNegative": 0,
"IG": null
}
],
"ManageData管理字段": [
{
"EditorTitle": null,
"EditorText": null,
"Summary": null,
"CategoryID": null,
"IssueID": "greedc",
"IsRead": false,
"IsDeleted": false,
"IsDeletedAnalyzeData": null,
"IsReleasePrimaryUrl": false,
"AlertLevel": 1,
"MessageBox": false,
"Mark": 0,
"Tag": null,
"IsTrack": false,
"Order": "2018-04-24T17:53:26.9511106+08:00",
"Sort": 0,
"HandleStatus": 0,
"ReviewedByUserID": null,
"LastReviewedTime": "2018-04-24T17:53:26.9511106+08:00",
"ReviewedAlert": false,
"ReviewedCategory": false,
"ReviewedDepartment": false,
"ReviewedRegion": false,
"ReviewedIndustry": false,
"ReviewedProduct": false,
"ReviewedPeople": false,
"ZyyicItemData": null
}
],
"PublishData发布字段": [
{
"IssueID": "Reader",
"HTML": null,
"ToRelease": false,
"IsReleased": false,
"ReleaseTime": "0001-01-01T00:00:00",
"Level": 0,
"HasImgAtt": false,
"IsIndex": false,
"IsPic": false,
"FileName": null,
"IsDeleted": false,
"SinglePic": null,
"BigPic": "b/ca9166dbd4c3af6879dc829357933b19_big.jpg",
"YieldPic": null,
"TopPic": null,
"ManyPic": null,
"OperationName": null,
"OperationTime": "2018-04-24T17:55:04.5467767+08:00",
"DisplayTag": [
"城市",
"二线",
"写字楼",
"产业",
"房地产",
"服务商",
"高新",
"科技",
"产值"
]
}
],
"LabelData": " 语料标注字段",
"ReferredAuthorIDs": "提及(@)的用户(不含转发的@)",
"IsOriginalDeleted": "原文是否已被删除",
"IsEPRPublished": "是否是epr公司发布的软广",
"Struct": "文本结构化信息"
}
]
\ No newline at end of file
/// <summary>
/// Public-opinion management fields (may differ per Issue)
/// </summary>
[Serializable]
public class ItemManageData
{
/// <summary>
/// Edited title
/// </summary>
public string EditorTitle { get; set; }
/// <summary>
/// Edited body text
/// </summary>
public string EditorText { get; set; }
/// <summary>
/// Summary
/// </summary>
public string Summary { get; set; }
/// <summary>
/// Original channel
/// </summary>
public string CategoryID { get; set; }
/// <summary>
/// Issue ID
/// </summary>
public string IssueID { get; set; }
/// <summary>
/// Already read
/// </summary>
public Boolean IsRead { get; set; }
/// <summary>
/// Deleted by the customer
/// </summary>
public Boolean IsDeleted { get; set; }
/// <summary>
/// The deleted analysis content (a workaround for a bug with the delete flag; currently in use)
/// </summary>
public ItemAnalyzeData IsDeletedAnalyzeData { get; set; }
/// <summary>
/// Whether to publish the link to the original article
/// </summary>
public bool IsReleasePrimaryUrl { get; set; }
/// <summary>
/// Alert level (0 means no alert)
/// </summary>
public SByte AlertLevel { get; set; }
/// <summary>
/// Whether to pop up a notification window
/// </summary>
public bool MessageBox { get; set; }
/// <summary>
/// Star mark (multiple levels)
/// </summary>
public SByte Mark { get; set; }
/// <summary>
/// User-defined tag
/// </summary>
public string Tag { get; set; }
/// <summary>
/// Whether the item is set to be tracked
/// </summary>
public bool IsTrack { get; set; }
/// <summary>
/// Ordering (time-based)
/// </summary>
public DateTime Order { get; set; }
/// <summary>
/// Ordering (numeric)
/// </summary>
public int Sort { get; set; }
/// <summary>
/// Workflow handling status
/// </summary>
public SByte HandleStatus { get; set; }
/// <summary>
/// UserID of the person who made the modification
/// </summary>
public string ReviewedByUserID { get; set; }
/// <summary>
/// Last update time
/// </summary>
public DateTime LastReviewedTime { get; set; }
/// <summary>
/// Whether the alert has been reviewed
/// </summary>
public Boolean ReviewedAlert { get; set; }
/// <summary>
/// Flag: category has been manually reviewed
/// </summary>
public Boolean ReviewedCategory { get; set; }
/// <summary>
/// Flag: department has been manually reviewed
/// </summary>
public Boolean ReviewedDepartment { get; set; }
/// <summary>
/// Flag: region has been manually reviewed
/// </summary>
public Boolean ReviewedRegion { get; set; }
/// <summary>
/// Flag: industry has been manually reviewed
/// </summary>
public Boolean ReviewedIndustry { get; set; }
/// <summary>
/// Flag: product has been manually reviewed
/// </summary>
public Boolean ReviewedProduct { get; set; }
/// <summary>
/// Flag: person names have been manually reviewed
/// </summary>
public Boolean ReviewedPeople { get; set; }
/// <summary>
/// Custom field for Zyyic
/// </summary>
public ZyyicItemData ZyyicItemData { get; set; }
}
/// <summary>
/// Publishing management attributes (may differ per Issue)
/// </summary>
[Serializable]
public class ItemPublishData
{
/// <summary>
/// Creates an empty ItemPublishData.
/// </summary>
public ItemPublishData()
{
}
/// <summary>
/// Issue ID
/// </summary>
public string IssueID { get; set; }
/// <summary>
/// HTML used for publishing
/// </summary>
public string HTML { get; set; }
/// <summary>
/// Pending review / available for publishing
/// </summary>
public bool ToRelease { get; set; }
/// <summary>
/// Marked as published
/// </summary>
public bool IsReleased { get; set; }
/// <summary>
/// Displayed release time
/// </summary>
[BsonDateTimeOptions(Kind = DateTimeKind.Local)]
public DateTime ReleaseTime { get; set; }
/// <summary>
/// Article level (e.g. to indicate a hot article)
/// </summary>
public sbyte Level { get; set; }
/// <summary>
/// Contains image attachments
/// </summary>
public bool HasImgAtt { get; set; }
/// <summary>
/// Whether it is set as a home-page item
/// </summary>
public bool IsIndex { get; set; }
/// <summary>
/// Whether it contains images
/// </summary>
public bool IsPic { get; set; }
/// <summary>
/// File name
/// </summary>
public string FileName { get; set; }
/// <summary>
/// Whether it is deleted
/// </summary>
public bool IsDeleted { get; set; }
/// <summary>
/// Single-image mode
/// </summary>
public string SinglePic { get; set; }
/// <summary>
/// Large-image mode, 552*396
/// </summary>
public string BigPic { get; set; }
/// <summary>
/// Special-topic image, 395*168
/// </summary>
public string YieldPic { get; set; }
/// <summary>
/// Headline image, 600*320
/// </summary>
public string TopPic { get; set; }
/// <summary>
/// Multi-image mode
/// </summary>
public string[] ManyPic { get; set; }
/// <summary>
/// Operator(s)
/// </summary>
public string[] OperationName { get; set; }
/// <summary>
/// Operation time
/// </summary>
public DateTime OperationTime { get; set; }
/// <summary>
/// Tags displayed in the reader
/// </summary>
public string[] DisplayTag { get; set; }
}
@property
def MediaName(self):
    """Return the stored media name."""
    return self.__MediaName

@MediaName.setter
def MediaName(self, MediaName):
    """Store the media name after converting it with ``str``."""
    as_text = str(MediaName)
    self.__MediaName = as_text
@property
def ItemAnalyzeDataList(self):
    """Return the stored list of analyze-data entries."""
    return self.__ItemAnalyzeDataList

@ItemAnalyzeDataList.setter
def ItemAnalyzeDataList(self, ItemAnalyzeDataList):
    """Store the entries as a new list built from the given iterable.

    The previous code called the list literal (``[](...)``), which raises
    ``TypeError: 'list' object is not callable``; ``list(...)`` is the
    conversion the sibling setters (``str(...)``, ``int(...)``) mirror.
    """
    self.__ItemAnalyzeDataList = list(ItemAnalyzeDataList)
@property
def MediaOrganType(self):
    """Return the stored media organ type."""
    return self.__MediaOrganType

@MediaOrganType.setter
def MediaOrganType(self, MediaOrganType):
    """Store the media organ type after converting it with ``int``."""
    as_number = int(MediaOrganType)
    self.__MediaOrganType = as_number
\ No newline at end of file
@property
@property
def MediaName(self):
    """Return the stored media name."""
    return self.__MediaName

@MediaName.setter
def MediaName(self, MediaName):
    """Store the media name after converting it with ``str``."""
    as_text = str(MediaName)
    self.__MediaName = as_text
\ No newline at end of file
File added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment