OSDN Git Service

テキストからURLを抽出してTwitterEntityUrlとして出力するExtractUrlEntitiesメソッドを追加
author Kimura Youichi <kim.upsilon@bucyou.net>
Tue, 20 Oct 2015 08:06:40 +0000 (17:06 +0900)
committer Kimura Youichi <kim.upsilon@bucyou.net>
Tue, 20 Oct 2015 13:26:29 +0000 (22:26 +0900)
OpenTween.Tests/TweetExtractorTest.cs
OpenTween/TweetExtractor.cs

index 2a423e2..74b12d6 100644 (file)
@@ -24,6 +24,7 @@ using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
+using OpenTween.Api;
 using Xunit;
 
 namespace OpenTween
@@ -50,5 +51,53 @@ namespace OpenTween
             // ただし、末尾にパスが続く場合は t.co に短縮される
             Assert.Equal(new[] { "example.jp/hogehoge" }, TweetExtractor.ExtractUrls("example.jp/hogehoge"));
         }
+
+        [Fact]
+        public void ExtractUrlEntities_Test() // plain ASCII text followed by exactly one URL
+        {
+            var entity = TweetExtractor.ExtractUrlEntities("hogehoge http://example.com/").Single(); // Single() also asserts that only one entity is produced
+
+            Assert.Equal(new[] { 9, 28 }, entity.Indices); // the URL follows the 9-character prefix "hogehoge " and is 19 characters long
+            Assert.Equal("http://example.com/", entity.Url); // Url, ExpandedUrl and DisplayUrl are all expected to hold the extracted text unchanged
+            Assert.Equal("http://example.com/", entity.ExpandedUrl);
+            Assert.Equal("http://example.com/", entity.DisplayUrl);
+        }
+
+        [Fact]
+        public void ExtractUrlEntities_SurrogatePairTest() // URL surrounded by non-ASCII symbol characters
+        {
+            var entity = TweetExtractor.ExtractUrlEntities("✨ http://example.com/ ✨").Single(); // NOTE(review): ✨ is U+2728, a BMP character (one UTF-16 code unit), so this input appears to contain no surrogate pair — confirm the intended character
+
+            Assert.Equal(new[] { 2, 21 }, entity.Indices); // the URL follows the 2-character prefix "✨ " and is 19 characters long
+            Assert.Equal("http://example.com/", entity.Url); // the trailing " ✨" must not be included in the extracted URL
+            Assert.Equal("http://example.com/", entity.ExpandedUrl);
+            Assert.Equal("http://example.com/", entity.DisplayUrl);
+        }
+
+        [Fact]
+        public void ExtractUrlEntities_CompositeCharacterTest()
+        {
+            // Tweet containing the precomposed character é ( \u00e9 ) (counted as one character)
+            // See: https://dev.twitter.com/issues/251
+            var entity = TweetExtractor.ExtractUrlEntities("Caf\u00e9 http://example.com/").Single();
+
+            Assert.Equal(new[] { 5, 24 }, entity.Indices); // "Caf", the single char \u00e9 and a space make a 5-character prefix; the URL is 19 characters long
+            Assert.Equal("http://example.com/", entity.Url);
+            Assert.Equal("http://example.com/", entity.ExpandedUrl);
+            Assert.Equal("http://example.com/", entity.DisplayUrl);
+        }
+
+        [Fact]
+        public void ExtractUrlEntities_CombiningCharacterSequenceTest()
+        {
+            // Tweet containing the combining character sequence é ( e + \u0301 ) (counted as two characters)
+            // See: https://dev.twitter.com/issues/251
+            var entity = TweetExtractor.ExtractUrlEntities("Cafe\u0301 http://example.com/").Single();
+
+            Assert.Equal(new[] { 6, 25 }, entity.Indices); // "Cafe", the combining mark \u0301 and a space make a 6-character prefix; the URL is 19 characters long
+            Assert.Equal("http://example.com/", entity.Url);
+            Assert.Equal("http://example.com/", entity.ExpandedUrl);
+            Assert.Equal("http://example.com/", entity.DisplayUrl);
+        }
     }
 }
index 57cc422..6a6bfa6 100644 (file)
@@ -25,6 +25,7 @@ using System.Linq;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Threading.Tasks;
+using OpenTween.Api;
 
 namespace OpenTween
 {
@@ -35,6 +36,14 @@ namespace OpenTween
         /// </summary>
         public static IEnumerable<string> ExtractUrls(string text)
         {
+            return ExtractUrlEntities(text).Select(x => x.Url);
+        }
+
+        /// <summary>
+        /// テキストから URL を抽出してエンティティとして返します
+        /// </summary>
+        public static IEnumerable<TwitterEntityUrl> ExtractUrlEntities(string text)
+        {
             var urlMatches = Regex.Matches(text, Twitter.rgUrl, RegexOptions.IgnoreCase).Cast<Match>();
             foreach (var m in urlMatches)
             {
@@ -43,12 +52,13 @@ namespace OpenTween
                 var protocol = m.Groups["protocol"].Value;
                 var domain = m.Groups["domain"].Value;
                 var path = m.Groups["path"].Value;
+
+                var validUrl = false;
                 if (protocol.Length == 0)
                 {
                     if (Regex.IsMatch(before, Twitter.url_invalid_without_protocol_preceding_chars))
                         continue;
 
-                    var validUrl = false;
                     string lasturl = null;
 
                     var last_url_invalid_match = false;
@@ -67,15 +77,24 @@ namespace OpenTween
                     {
                         validUrl = true;
                     }
-
-                    if (validUrl)
-                    {
-                        yield return url;
-                    }
                 }
                 else
                 {
-                    yield return url;
+                    validUrl = true;
+                }
+
+                if (validUrl)
+                {
+                    var startPos = m.Groups["url"].Index;
+                    var endPos = startPos + m.Groups["url"].Length;
+
+                    yield return new TwitterEntityUrl
+                    {
+                        Indices = new[] { startPos, endPos },
+                        Url = url,
+                        ExpandedUrl = url,
+                        DisplayUrl = url,
+                    };
                 }
             }
         }