OSDN Git Service

TweetExtractorクラスにハッシュタグとメンションを抽出するメソッドを追加
author Kimura Youichi <kim.upsilon@bucyou.net>
Tue, 20 Oct 2015 09:10:44 +0000 (18:10 +0900)
committer Kimura Youichi <kim.upsilon@bucyou.net>
Tue, 20 Oct 2015 13:26:54 +0000 (22:26 +0900)
正規表現は Twitter.CreateHtmlAnchorAsync で使用していたパターンを元にした

OpenTween.Tests/TweetExtractorTest.cs
OpenTween/TweetExtractor.cs
OpenTween/Twitter.cs

index 74b12d6..c517a47 100644 (file)
@@ -99,5 +99,32 @@ namespace OpenTween
             Assert.Equal("http://example.com/", entity.ExpandedUrl);
             Assert.Equal("http://example.com/", entity.DisplayUrl);
         }
+
+        [Fact]
+        public void ExtractMentionEntities_Test()
+        {
+            // A plain "@screen_name" mention that follows ordinary text.
+            const string input = "hogehoge @twitterapi";
+
+            var found = TweetExtractor.ExtractMentionEntities(input);
+            var mention = found.Single();
+
+            // Indices are [start, end) offsets into the input; the at sign is part of the match.
+            Assert.Equal(new[] { 9, 20 }, mention.Indices);
+            Assert.Equal("@twitterapi", mention.ScreenName);
+        }
+
+        [Fact]
+        public void ExtractMentionEntities_ListTest()
+        {
+            // A list mention ("@owner/list-name") must come back as exactly one entity.
+            const string input = "hogehoge @twitter/developers";
+
+            var found = TweetExtractor.ExtractMentionEntities(input);
+            var listMention = found.Single();
+
+            Assert.Equal(new[] { 9, 28 }, listMention.Indices);
+            Assert.Equal("@twitter/developers", listMention.ScreenName);
+        }
+
+        [Fact]
+        public void ExtractHashtagEntities_Test()
+        {
+            // A single hashtag that follows ordinary text.
+            const string input = "hogehoge #test";
+
+            var found = TweetExtractor.ExtractHashtagEntities(input);
+            var hashtag = found.Single();
+
+            // The reported text and range both include the leading hash sign.
+            Assert.Equal(new[] { 9, 14 }, hashtag.Indices);
+            Assert.Equal("#test", hashtag.Text);
+        }
     }
 }
index 6a6bfa6..ee197f0 100644 (file)
@@ -98,5 +98,62 @@ namespace OpenTween
                 }
             }
         }
+
+        /// <summary>
+        /// Extracts mentions from the given text and returns them as entities.
+        /// </summary>
+        /// <remarks>
+        /// List mentions ("@user/list") are yielded first, followed by plain user mentions ("@user"),
+        /// so the sequence is not ordered by position. ScreenName holds the matched text including
+        /// the leading at sign.
+        /// </remarks>
+        /// <param name="text">Text to scan for mentions.</param>
+        /// <returns>
+        /// One entity per mention. Indices is { start, end } where start is the offset of the at sign
+        /// and end is the offset just past the last matched character.
+        /// </returns>
+        public static IEnumerable<TwitterEntityMention> ExtractMentionEntities(string text)
+        {
+            // NOTE(review): both character classes list the at sign twice in this view; presumably
+            // one of them was the full-width form in the original file. Confirm before committing.
+
+            // List mention: "@owner/list-name".
+            const string listPattern = @"(^|[^a-zA-Z0-9_/])([@@][a-zA-Z0-9_]{1,20}/[a-zA-Z][a-zA-Z0-9\p{IsLatin-1Supplement}\-]{0,79})";
+
+            // Plain mention: "@screen_name". The trailing boundary is a lookahead so that it is not
+            // consumed: in "@a @b" the space must stay available as the leading boundary of "@b",
+            // otherwise the second mention is silently dropped.
+            const string userPattern = "(^|[^a-zA-Z0-9_/])([@@][a-zA-Z0-9_]{1,20})(?=[^a-zA-Z0-9_/]|$)";
+
+            // Lists first, then plain mentions. A list mention is never reported a second time as a
+            // plain mention because "/" is not an accepted trailing boundary in userPattern.
+            foreach (var pattern in new[] { listPattern, userPattern })
+            {
+                foreach (Match match in Regex.Matches(text, pattern))
+                {
+                    // Group 1 is only the leading boundary; group 2 is the mention itself.
+                    var groupMention = match.Groups[2];
+                    var startPos = groupMention.Index;
+                    var endPos = startPos + groupMention.Length;
+
+                    yield return new TwitterEntityMention
+                    {
+                        Indices = new[] { startPos, endPos },
+                        ScreenName = groupMention.Value,
+                    };
+                }
+            }
+        }
+
+        /// <summary>
+        /// Extracts hashtags from the given text and returns them as entities.
+        /// </summary>
+        /// <param name="text">Text to scan for hashtags.</param>
+        /// <returns>
+        /// One entity per match of Twitter.HASHTAG. Text is the concatenation of capture groups 2 and 3,
+        /// and Indices is { start of group 2, start + combined length of groups 2 and 3 }.
+        /// </returns>
+        public static IEnumerable<TwitterEntityHashtag> ExtractHashtagEntities(string text)
+        {
+            // NOTE(review): the removed CreateHtmlAnchorAsync overload applied this same pattern with
+            // RegexOptions.IgnoreCase; confirm that matching without it here is intended.
+            foreach (Match hashtagMatch in Regex.Matches(text, Twitter.HASHTAG))
+            {
+                // Group 2 is used as the hash sign and group 3 as the tag body; group 1 is ignored.
+                var sharp = hashtagMatch.Groups[2];
+                var body = hashtagMatch.Groups[3];
+
+                var begin = sharp.Index;
+                var end = begin + sharp.Length + body.Length;
+
+                var hashtag = new TwitterEntityHashtag();
+                hashtag.Indices = new[] { begin, end };
+                hashtag.Text = sharp.Value + body.Value;
+
+                yield return hashtag;
+            }
+        }
     }
 }
index 9029088..fbbbdbf 100644 (file)
@@ -2677,136 +2677,6 @@ namespace OpenTween
             this.CheckStatusCode(res, content);
         }
 
-        private class range
-        {
-            public int fromIndex { get; set; }
-            public int toIndex { get; set; }
-            public range(int fromIndex, int toIndex)
-            {
-                this.fromIndex = fromIndex;
-                this.toIndex = toIndex;
-            }
-        }
-        public async Task<string> CreateHtmlAnchorAsync(string Text, List<string> AtList, Dictionary<string, string> media)
-        {
-            if (Text == null) return null;
-            var retStr = Text.Replace("&gt;", "<<<<<tweenだいなり>>>>>").Replace("&lt;", "<<<<<tweenしょうなり>>>>>");
-            //uriの正規表現
-            //const string url_valid_domain = "(?<domain>(?:[^\p{P}\s][\.\-_](?=[^\p{P}\s])|[^\p{P}\s]){1,}\.[a-z]{2,}(?::[0-9]+)?)"
-            //const string url_valid_general_path_chars = "[a-z0-9!*';:=+$/%#\[\]\-_,~]"
-            //const string url_balance_parens = "(?:\(" + url_valid_general_path_chars + "+\))"
-            //const string url_valid_url_path_ending_chars = "(?:[a-z0-9=_#/\-\+]+|" + url_balance_parens + ")"
-            //const string pth = "(?:" + url_balance_parens +
-            //    "|@" + url_valid_general_path_chars + "+/" +
-            //    "|[.,]?" + url_valid_general_path_chars + "+" +
-            //    ")"
-            //const string pth2 = "(/(?:" +
-            //    pth + "+" + url_valid_url_path_ending_chars + "|" +
-            //    pth + "+" + url_valid_url_path_ending_chars + "?|" +
-            //    url_valid_url_path_ending_chars +
-            //    ")?)?"
-            //const string qry = "(?<query>\?[a-z0-9!*'();:&=+$/%#\[\]\-_.,~]*[a-z0-9_&=#])?"
-            //const string rgUrl = "(?<before>(?:[^\""':!=#]|^|\:/))" +
-            //                            "(?<url>(?<protocol>https?://)" +
-            //                            url_valid_domain +
-            //                            pth2 +
-            //                            qry +
-            //                            ")"
-            //const string rgUrl = "(?<before>(?:[^\""':!=#]|^|\:/))" +
-            //                            "(?<url>(?<protocol>https?://|www\.)" +
-            //                            url_valid_domain +
-            //                            pth2 +
-            //                            qry +
-            //                            ")"
-            //絶対パス表現のUriをリンクに置換
-            retStr = await new Regex(rgUrl, RegexOptions.IgnoreCase).ReplaceAsync(retStr, async mu =>
-            {
-                var sb = new StringBuilder(mu.Result("${before}<a href=\""));
-                //if (mu.Result("${protocol}").StartsWith("w", StringComparison.OrdinalIgnoreCase))
-                //    sb.Append("http://");
-                //}
-                var url = mu.Result("${url}");
-                var title = await ShortUrl.Instance.ExpandUrlAsync(url);
-                sb.Append(url + "\" title=\"" + MyCommon.ConvertToReadableUrl(title) + "\">").Append(url).Append("</a>");
-                if (media != null && !media.ContainsKey(url)) media.Add(url, title);
-                return sb.ToString();
-            });
-
-            //@先をリンクに置換(リスト)
-            retStr = Regex.Replace(retStr,
-                                   @"(^|[^a-zA-Z0-9_/])([@@]+)([a-zA-Z0-9_]{1,20}/[a-zA-Z][a-zA-Z0-9\p{IsLatin-1Supplement}\-]{0,79})",
-                                   "$1$2<a href=\"/$3\">$3</a>");
-
-            var m = Regex.Match(retStr, "(^|[^a-zA-Z0-9_])[@@]([a-zA-Z0-9_]{1,20})");
-            while (m.Success)
-            {
-                if (!AtList.Contains(m.Result("$2").ToLower())) AtList.Add(m.Result("$2").ToLower());
-                m = m.NextMatch();
-            }
-            //@先をリンクに置換
-            retStr = Regex.Replace(retStr,
-                                   "(^|[^a-zA-Z0-9_/])([@@])([a-zA-Z0-9_]{1,20})",
-                                   "$1$2<a href=\"/$3\">$3</a>");
-
-            //ハッシュタグを抽出し、リンクに置換
-            var anchorRange = new List<range>();
-            for (int i = 0; i < retStr.Length; i++)
-            {
-                var index = retStr.IndexOf("<a ", i);
-                if (index > -1 && index < retStr.Length)
-                {
-                    i = index;
-                    var toIndex = retStr.IndexOf("</a>", index);
-                    if (toIndex > -1)
-                    {
-                        anchorRange.Add(new range(index, toIndex + 3));
-                        i = toIndex;
-                    }
-                }
-            }
-            //retStr = Regex.Replace(retStr,
-            //                       "(^|[^a-zA-Z0-9/&])([##])([0-9a-zA-Z_]*[a-zA-Z_]+[a-zA-Z0-9_\xc0-\xd6\xd8-\xf6\xf8-\xff]*)",
-            //                       new MatchEvaluator(Function(mh As Match)
-            //                                              foreach (var rng in anchorRange)
-            //                                              {
-            //                                                  if (mh.Index >= rng.fromIndex &&
-            //                                                   mh.Index <= rng.toIndex) return mh.Result("$0");
-            //                                              }
-            //                                              if (IsNumeric(mh.Result("$3"))) return mh.Result("$0");
-            //                                              lock (LockObj)
-            //                                              {
-            //                                                  _hashList.Add("#" + mh.Result("$3"))
-            //                                              }
-            //                                              return mh.Result("$1") + "<a href=\"" + _protocol + "twitter.com/search?q=%23" + mh.Result("$3") + "\">" + mh.Result("$2$3") + "</a>";
-            //                                          }),
-            //                                      RegexOptions.IgnoreCase)
-            retStr = Regex.Replace(retStr,
-                                   HASHTAG,
-                                   new MatchEvaluator(mh =>
-                                                      {
-                                                          foreach (var rng in anchorRange)
-                                                          {
-                                                              if (mh.Index >= rng.fromIndex &&
-                                                               mh.Index <= rng.toIndex) return mh.Result("$0");
-                                                          }
-                                                          lock (LockObj)
-                                                          {
-                                                              _hashList.Add("#" + mh.Result("$3"));
-                                                          }
-                                                          return mh.Result("$1") + "<a href=\"https://twitter.com/search?q=%23" + mh.Result("$3") + "\">" + mh.Result("$2$3") + "</a>";
-                                                      }),
-                                                  RegexOptions.IgnoreCase);
-
-
-            retStr = Regex.Replace(retStr, "(^|[^a-zA-Z0-9_/&##@@>=.~])(sm|nm)([0-9]{1,10})", "$1<a href=\"http://www.nicovideo.jp/watch/$2$3\">$2$3</a>");
-
-            retStr = retStr.Replace("<<<<<tweenだいなり>>>>>", "&gt;").Replace("<<<<<tweenしょうなり>>>>>", "&lt;");
-
-            //retStr = AdjustHtml(ShortUrl.Resolve(PreProcessUrl(retStr), true)) //IDN置換、短縮Uri解決、@リンクを相対→絶対にしてtarget属性付与
-            retStr = AdjustHtml(PreProcessUrl(retStr)); //IDN置換、短縮Uri解決、@リンクを相対→絶対にしてtarget属性付与
-            return retStr;
-        }
-
         public async Task<string> CreateHtmlAnchorAsync(string text, List<string> AtList, TwitterEntities entities, List<MediaInfo> media)
         {
             if (entities != null)