2009年5月12日星期二
正文抽取所需正则_skeryl的空间
/// <summary>
/// Strips all HTML tags.
/// </summary>
// NOTE(review): this declaration appears to have been damaged by text extraction and will
// not compile as shown: the spaces between the C# tokens, the quotes of the verbatim string,
// and characters such as angle brackets and ampersands are missing (for example "(?Style"
// where a named group would be expected, and "[^]*" where a negated class is left empty).
// The alternatives look like they target links, style/select/script blocks, HTML comments,
// list items, generic tags, character entities and whitespace runs, but the exact pattern
// cannot be recovered from this listing. Restore it from the original source, do not guess.
privatestaticreadonlyRegexFilterAll=newRegex(
@(\[([^=]*)(=[^\]]*)?\][\s\S]*?\[/\1\])|(?lj(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,.);])a\s+[^]*[^]{2,}/a(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,.);]))|(?Stylestyle[\s\S]+?/style)|(?selectselect[\s\S]+?/select)|(?Scriptscript[\s\S]*?/script)|(?Explein\!\-\-[\s\S]*?\-\-)|(?lili(\s+[^]+)?[\s\S]*?/li)|(?Html/?\s*[^]+(\s*[^=]+?=[']?[^']+?[']?)*?[^\[]*)|(?Other[a-zA-Z]+;)|(?Other2\#[a-z0-9]{6})|(?Space\s+)|(\\#\d+\;),
RegexOptions.ExplicitCapture
|RegexOptions.Multiline
|RegexOptions.IgnoreCase);//(?Linka[\s\S]*?/a)|
// Commented-out earlier variant of the pattern (equally damaged by extraction):
//(?Stylestyle[\s\S]+?/style)|(?selectselect[\s\S]+?/select)|(?Scriptscript[\s\S]*?/script)|(?Explein\!\-\-[\s\S]*?\-\-)|(?lili(\s+[^]+)?[\s\S]*?/li)|(?Html/?\s*[^]+(\s*[^=]+?=[']?[^']+?[']?)*?[^\[]*)|(?Other[a-zA-Z]+;)|(?Other2\#[a-z0-9]{6})|(?Space\s+)
/// <summary>
/// Matches an opening or closing title tag, case-insensitively.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace, string quotes and
/// angle brackets (the pattern read <c>\s*/?title\s*</c>). The tag delimiters are
/// reconstructed here on the assumption that the pattern targets a tag; confirm against the
/// original source.
/// </remarks>
private static readonly Regex FindTitle = new Regex(
    @"<\s*/?title\s*>",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Matches a title element and captures the text between its tags in the named group
/// <c>Content</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace, string quotes and
/// angle brackets (the group read <c>(?Content</c>, which is not valid regex syntax). The
/// named-group brackets and tag delimiters are reconstructed; confirm against the original.
/// </remarks>
private static readonly Regex FindTitleContent = new Regex(
    @"<\s*/?title\s*>(?<Content>[\s\S]*?)<\s*/?title\s*>",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Matches opening or closing <c>h</c> and <c>strong</c> tags, case-insensitively.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace, string quotes and
/// angle brackets; the tag delimiters are reconstructed. As written the first alternative
/// matches only a bare <c>h</c> tag, not numbered headings such as h1 to h6. Confirm against
/// the original whether a digit was intended.
/// </remarks>
private static readonly Regex FindHStrong = new Regex(
    @"<\s*/?h\s*>|<\s*/?strong\s*>",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Matches <c>p</c>, <c>br</c> and <c>tr</c> tags (the source comment mentions only p and
/// br), case-insensitively.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace, string quotes and
/// angle brackets; the tag delimiters are reconstructed. Confirm against the original source.
/// </remarks>
private static readonly Regex FindPB = new Regex(
    @"<\s*/?p\s*>|<\s*br\s*/?>|<\s*/?tr\s*>",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Matches the text <c>nbsp</c>, case-insensitively.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace and string quotes,
/// which are restored here. The pattern text itself is kept exactly as it appeared; it may
/// originally have included a leading ampersand that was also lost. Confirm against the
/// original source.
/// </remarks>
private static readonly Regex FindNbsp = new Regex(
    @"nbsp",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Captures, in the named group <c>Content</c>, the shortest run of text that precedes a
/// literal dollar sign (the source comment calls this the "end marker").
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace, string quotes and
/// the angle brackets of the named group (it read <c>(?Content</c>); these are restored.
/// Whether further characters of the end marker were lost cannot be told from the listing.
/// </remarks>
private static readonly Regex FindS = new Regex(
    @"(?<Content>[\s\S]*?)\$",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Matches any single sentence-punctuation character; used to decide whether a piece of text
/// looks like a normal sentence.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace and string quotes,
/// and its character class contained repeated ASCII characters (for example two exclamation
/// marks and two colons). That suggests each pair was originally an ASCII character plus its
/// full-width form, folded to ASCII during extraction. The full-width forms are restored here
/// on that assumption; confirm against the original source.
/// </remarks>
private static readonly Regex IsSen = new Regex(
    @"[,.,。!!;;::……??《》]",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Matches the literal marker <c>[(h)]</c>; per the source comment it is used to spot junk
/// text that contains too many strong/h tag markers.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace and string quotes,
/// which are restored here. The pattern text is kept exactly as it appeared; how the
/// <c>[(h)]</c> marker gets into the text is not visible in this file.
/// </remarks>
private static readonly Regex IsWs = new Regex(
    @"\[\(h\)\]",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Matches a hyphen, a colon (ASCII or full-width) or the literal text <c>[·]</c>; per the
/// source comment it is used to spot junk text containing too many colons, middle dots and
/// hyphens.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace and string quotes.
/// Its colon class held two identical ASCII colons, which suggests the second was a
/// full-width colon folded to ASCII during extraction; it is restored on that assumption.
/// The first alternative is kept as it appeared: the escaped opening bracket makes it match
/// the three-character text <c>[·]</c> rather than a lone middle dot. Confirm against the
/// original whether a character class was intended.
/// </remarks>
private static readonly Regex IsWsM = new Regex(
    @"\[·]|[-]|[::]",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Matches typical forum (BBS) markers: a floor label of the form 第…楼, or a
/// "Powered by" footer mentioning Dvbbs or Discuz. Matching is case-insensitive.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace and string quotes,
/// which are restored here. The pattern text is kept exactly as it appeared; the optional
/// slash before "by" may be what is left of a tag whose brackets were lost. Confirm against
/// the original source.
/// </remarks>
private static readonly Regex IsBbsInfo = new Regex(
    @"第[^楼]{1,50}楼|Powered\s*/?by[\s\S]*?Dvbbs|Powered\s*/?by[\s\S]*?Discuz",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Extracts the page keywords (the "keywords" meta entry).
/// </summary>
// NOTE(review): this declaration appears to have been damaged by text extraction and will
// not compile as shown: the spaces between the C# tokens, the quotes of the verbatim string
// and the angle brackets of the named group "KeyWords" are missing, and the classes "[']"
// and "[^]" look like they lost a double quote and a closing angle bracket respectively.
// The two alternatives differ in the order of the name and content attributes. Restore the
// pattern from the original source rather than guessing.
privatestaticreadonlyRegexmKeyWord=newRegex(
@meta\s*name\s*=\s*[']?keywords[']?\s*content\s*=\s*[']?(?KeyWords[^']*)[']?[^]*|meta\s*content\s*=\s*[']?(?KeyWords[^']*)[']?\s*name\s*=\s*[']?keywords[']?\s*[^]*
,RegexOptions.ExplicitCapture|RegexOptions.Multiline|RegexOptions.IgnoreCase);
/// <summary>
/// Extracts the page description (the "description" meta entry).
/// </summary>
// NOTE(review): this declaration appears to have been damaged by text extraction and will
// not compile as shown: the spaces between the C# tokens, the quotes of the verbatim string
// and the angle brackets of the named group "description" are missing, and the classes
// "[']" and "[^]" look like they lost a double quote and a closing angle bracket
// respectively. The two alternatives differ in the order of the name and content
// attributes. Restore the pattern from the original source rather than guessing.
privatestaticreadonlyRegexmDescription=newRegex(
@meta\s*name\s*=\s*[']?description[']?\s*content\s*=\s*[']?(?description[^']*)[']?[^]*|meta\s*content\s*=\s*[']?(?description[^']*)[']?\s*name\s*=\s*[']?description[']?\s*[^]*
,RegexOptions.ExplicitCapture|RegexOptions.Multiline|RegexOptions.IgnoreCase);
/// <summary>
/// Extracts the page tags (the "tagwords" meta entry).
/// </summary>
// NOTE(review): this declaration appears to have been damaged by text extraction and will
// not compile as shown: the spaces between the C# tokens, the quotes of the verbatim string
// and the angle brackets of the named group "tagwords" are missing, and the classes "[']"
// and "[^]" look like they lost a double quote and a closing angle bracket respectively.
// The two alternatives differ in the order of the name and content attributes. Restore the
// pattern from the original source rather than guessing.
privatestaticreadonlyRegexmTag=newRegex(
@meta\s*name\s*=\s*[']?tagwords[']?\s*content\s*=\s*[']?(?tagwords[^']*)[']?[^]*|meta\s*content\s*=\s*[']?(?tagwords[^']*)[']?\s*name\s*=\s*[']?tagwords[']?\s*[^]*
,RegexOptions.ExplicitCapture|RegexOptions.Multiline|RegexOptions.IgnoreCase);
/// <summary>
/// Matches a whole line that looks like a short "label: value" fragment: at most 8
/// characters that are neither whitespace nor 说, then a colon (ASCII or full-width), then
/// at most 10 further characters. Per the source comment such lines are treated as junk.
/// </summary>
/// <remarks>
/// <c>RegexOptions.Multiline</c> makes <c>^</c> and <c>$</c> anchor at every line, so each
/// line of the input is tested on its own.
/// NOTE(review): the listing this was taken from had lost its whitespace and string quotes.
/// Its colon class held two identical ASCII colons, which suggests the second was a
/// full-width colon folded to ASCII during extraction; it is restored on that assumption.
/// The source comment also mentions a "no 关于 after the colon" rule that this pattern does
/// not implement.
/// </remarks>
private static readonly Regex IsWsMM = new Regex(
    @"^[^说\s]{0,8}?[::].{0,10}$",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Finds the URL marker written by the spider and captures the rest of that line, after the
/// <c>http://</c> prefix, in the named group <c>URL</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace, string quotes and
/// the angle brackets of the named group (it read <c>(?URL</c>); these are restored. The
/// colon after 为 is kept as the ASCII colon shown in the listing; it may originally have
/// been full-width. Confirm against the marker the spider actually writes.
/// </remarks>
private static readonly Regex txtUrl = new Regex(
    @"当前URL为:http://(?<URL>.*)",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
/// <summary>
/// Finds the anchor-description marker written by the spider and captures the rest of that
/// line in the named group <c>Describe</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the listing this was taken from had lost its whitespace, string quotes and
/// the angle brackets of the named group (it read <c>(?Describe</c>); these are restored.
/// The colon after 为 is kept as the ASCII colon shown in the listing; it may originally
/// have been full-width. Confirm against the marker the spider actually writes.
/// </remarks>
private static readonly Regex txtDescription = new Regex(
    @"当前链接描述为:(?<Describe>.*)",
    RegexOptions.ExplicitCapture
    | RegexOptions.Multiline
    | RegexOptions.IgnoreCase);
///// <summary>
///// Picks out the anchor (a) tags that are needed. Disabled: the declaration below is commented out.
///// </summary>
//privatestaticreadonlyRegexcleanFirst=newRegex(
//@([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,.);])(?Robbish1a\s+[^]*)[^]{1,6}(?Robbish2/a)([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,.);]),RegexOptions.ExplicitCapture|RegexOptions.Multiline|RegexOptions.IgnoreCase);
ion
||浏览()|(0)最近读者:网友评论:发表评论:姓名:*姓名最长为50字节网址或邮箱:(选填)内容:验证码:请点击后输入四位验证码,字母不区分大小写
订阅:
博文评论 (Atom)
0 评论:
发表评论