using System.Linq;
using System.Text;
using System.Threading.Tasks;
+using OpenTween.Api;
using Xunit;
namespace OpenTween
// ただし、末尾にパスが続く場合は t.co に短縮される
Assert.Equal(new[] { "example.jp/hogehoge" }, TweetExtractor.ExtractUrls("example.jp/hogehoge"));
}
+
+ [Fact]
+ public void ExtractUrlEntities_Test()
+ {
+ var entity = TweetExtractor.ExtractUrlEntities("hogehoge http://example.com/").Single();
+ // "hogehoge " is 9 characters and the URL is 19, so the expected span is start 9, end 28 (end = start + length).
+ Assert.Equal(new[] { 9, 28 }, entity.Indices);
+ Assert.Equal("http://example.com/", entity.Url);
+ Assert.Equal("http://example.com/", entity.ExpandedUrl);
+ Assert.Equal("http://example.com/", entity.DisplayUrl);
+ }
+
+ [Fact]
+ public void ExtractUrlEntities_SurrogatePairTest()
+ {
+ var entity = TweetExtractor.ExtractUrlEntities("✨ http://example.com/ ✨").Single();
+ // NOTE(review): "✨" is U+2728, a single UTF-16 code unit rather than a surrogate pair, hence start index 2; confirm whether a non-BMP character was intended here.
+ Assert.Equal(new[] { 2, 21 }, entity.Indices);
+ Assert.Equal("http://example.com/", entity.Url);
+ Assert.Equal("http://example.com/", entity.ExpandedUrl);
+ Assert.Equal("http://example.com/", entity.DisplayUrl);
+ }
+
+ [Fact]
+ public void ExtractUrlEntities_CompositeCharacterTest()
+ {
+ // Tweet containing the precomposed character é ( \u00e9 ) (counted as one character)
+ // See: https://dev.twitter.com/issues/251
+ var entity = TweetExtractor.ExtractUrlEntities("Caf\u00e9 http://example.com/").Single();
+ // "Caf\u00e9 " is 5 characters, so the 19-character URL spans start 5, end 24.
+ Assert.Equal(new[] { 5, 24 }, entity.Indices);
+ Assert.Equal("http://example.com/", entity.Url);
+ Assert.Equal("http://example.com/", entity.ExpandedUrl);
+ Assert.Equal("http://example.com/", entity.DisplayUrl);
+ }
+
+ [Fact]
+ public void ExtractUrlEntities_CombiningCharacterSequenceTest()
+ {
+ // Tweet containing the combining character sequence é ( e + \u0301 ) (counted as two characters)
+ // See: https://dev.twitter.com/issues/251
+ var entity = TweetExtractor.ExtractUrlEntities("Cafe\u0301 http://example.com/").Single();
+ // "Cafe\u0301 " is 6 characters (the combining accent counts separately), so the 19-character URL spans start 6, end 25.
+ Assert.Equal(new[] { 6, 25 }, entity.Indices);
+ Assert.Equal("http://example.com/", entity.Url);
+ Assert.Equal("http://example.com/", entity.ExpandedUrl);
+ Assert.Equal("http://example.com/", entity.DisplayUrl);
+ }
}
}
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
+using OpenTween.Api;
namespace OpenTween
{
/// </summary>
public static IEnumerable<string> ExtractUrls(string text)
{
+ return ExtractUrlEntities(text).Select(x => x.Url); // delegate to ExtractUrlEntities and keep only each entity's Url
+ }
+
+ /// <summary>
+ /// Extracts URLs from the text and returns them as entities
+ /// </summary>
+ public static IEnumerable<TwitterEntityUrl> ExtractUrlEntities(string text)
+ {
var urlMatches = Regex.Matches(text, Twitter.rgUrl, RegexOptions.IgnoreCase).Cast<Match>();
foreach (var m in urlMatches)
{
var protocol = m.Groups["protocol"].Value;
var domain = m.Groups["domain"].Value;
var path = m.Groups["path"].Value;
+
+ var validUrl = false;
if (protocol.Length == 0)
{
if (Regex.IsMatch(before, Twitter.url_invalid_without_protocol_preceding_chars))
continue;
- var validUrl = false;
string lasturl = null;
var last_url_invalid_match = false;
{
validUrl = true;
}
-
- if (validUrl)
- {
- yield return url;
- }
}
else
{
- yield return url;
+ validUrl = true;
+ }
+ // A match with an explicit protocol is always accepted; a protocol-less match is emitted only if the branch above set validUrl.
+ if (validUrl)
+ {
+ var startPos = m.Groups["url"].Index;
+ var endPos = startPos + m.Groups["url"].Length;
+ // Indices holds start and end, where end = start + length of the "url" group. NOTE(review): these are regex match offsets; confirm the unit expected by consumers for text with surrogate pairs.
+ yield return new TwitterEntityUrl
+ {
+ Indices = new[] { startPos, endPos },
+ Url = url,
+ ExpandedUrl = url,
+ DisplayUrl = url,
+ };
}
}
}