From fd30db4aa253b88c47212ce650bdb9fd306aea34 Mon Sep 17 00:00:00 2001 From: wook Date: Wed, 10 May 2017 13:33:58 +0900 Subject: [PATCH] new pattern for url --- HTMLParser.Example/Parsed.aspx | 5 ++- HTMLParser.Example/Parsed.aspx.cs | 7 ++-- HTMLParser/ParserEx.cs | 54 +++++++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/HTMLParser.Example/Parsed.aspx b/HTMLParser.Example/Parsed.aspx index eaeeab2..38a4d8d 100644 --- a/HTMLParser.Example/Parsed.aspx +++ b/HTMLParser.Example/Parsed.aspx @@ -6,9 +6,8 @@ diff --git a/HTMLParser.Example/Parsed.aspx.cs b/HTMLParser.Example/Parsed.aspx.cs index 44963d0..af9c477 100644 --- a/HTMLParser.Example/Parsed.aspx.cs +++ b/HTMLParser.Example/Parsed.aspx.cs @@ -20,10 +20,9 @@ namespace HTMLParser.Example { ParserEx parse = new ParserEx(); - this.ParsedUrl.InnerHtml = parse.ParseUrl(this.txtSource.Value); - this.parsedYoutube.InnerHtml = parse.ParseYoutube(this.txtSource.Value); - this.parsedUrlAndYoutube.InnerHtml = parse.ParseUrlAndYoutube(this.txtSource.Value); + this.ParsedUrl.InnerHtml = parse.ParseUrl(this.txtSource.Value); + this.parsedYoutube.InnerHtml = parse.GenerateYoutubeScripts(this.txtSource.Value).First(); + this.parsedUrlAndYoutube.InnerHtml = parse.GenerateYoutubeScripts(this.txtSource.Value).First() + parse.ParseUrl(this.txtSource.Value); } - } } \ No newline at end of file diff --git a/HTMLParser/ParserEx.cs b/HTMLParser/ParserEx.cs index 3a03f2c..2af4dcb 100644 --- a/HTMLParser/ParserEx.cs +++ b/HTMLParser/ParserEx.cs @@ -1,20 +1,52 @@ using System; +using System.Collections.Generic; using System.Text.RegularExpressions; namespace HTMLParser { public class ParserEx { + /// + /// Prevent use of Html tags + /// + /// + /// + public string PreventHTML(string article) + { + return article.Replace("<", "<").Replace(">", ">"); + } + + /// + /// Prevent use of Risky Tags + /// + /// + /// + public string PreventRiskyTag(string article) + { + return article.Replace("script", "").Replace("iframe", "").Replace("object", ""); + } + + /// + /// Parse Url + /// + /// + /// public string ParseUrl(string article) { if (string.IsNullOrEmpty(article)) return string.Empty; - string Pttrn = @"(((http|https|ftp|telnet|news)://|www\.)[^youtube][a-z0-9-]+.[][a-zA-Z0-9:&#@=_~%;?/.+-]+)"; - string Lnk = "$1"; + string pttrn = @"((?:(?:https?|http|ftp|gopher|telnet|file|notes|ms-help):(?://|\\\\)(?:www\.)?|www\.)[\w\d:#@%/;$()~_?\+,\-=\\.&]+)"; - return Regex.Replace(article, Pttrn, Lnk, RegexOptions.IgnoreCase, TimeSpan.FromMilliseconds(150)).Replace("href=\"www.", "href=\"http://www."); + string lnk = "$1"; + + return Regex.Replace(article, pttrn, lnk, RegexOptions.IgnoreCase, TimeSpan.FromMilliseconds(150)).Replace("href=\"www.", "href=\"http://www."); } + /// + /// Parse Youtube Url to script + /// + /// + /// public string ParseYoutube(string article) { if (string.IsNullOrEmpty(article)) return string.Empty; @@ -25,6 +57,22 @@ namespace HTMLParser return Regex.Replace(article, pttrn, script, RegexOptions.IgnoreCase, TimeSpan.FromMilliseconds(150)).Replace("https://
GenerateYoutubeScripts(string article) + { + if (string.IsNullOrEmpty(article)) return null; + + Regex regex = new Regex(@"youtu(?:\.be|be\.com)/(?:.*v(?:/|=)|(?:.*/)?)([a-zA-Z0-9-_]+)"); + Match match = regex.Match(article); + List scripts = new List(); + while (match.Success) + { + scripts.Add(string.Format("
", match.Value.Replace("youtu.be/", ""))); + match = match.NextMatch(); + } + + return scripts; + } + public string ParseUrlAndYoutube(string article) { return this.ParseYoutube(this.ParseUrl(article));