1 // OpenTween - Client of Twitter
2 // Copyright (c) 2015 kim_upsilon (@kim_upsilon) <https://upsilo.net/~upsilon/>
3 // All rights reserved.
5 // This file is part of OpenTween.
7 // This program is free software; you can redistribute it and/or modify it
8 // under the terms of the GNU General Public License as published by the Free
9 // Software Foundation; either version 3 of the License, or (at your option)
12 // This program is distributed in the hope that it will be useful, but
13 // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 // You should have received a copy of the GNU General Public License along
18 // with this program. If not, see <http://www.gnu.org/licenses/>, or write to
19 // the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
20 // Boston, MA 02110-1301, USA.
23 using System.Collections.Generic;
26 using System.Text.RegularExpressions;
27 using System.Threading.Tasks;
28 using OpenTween.Api.DataModel;
32 public static class TweetExtractor
/// <summary>
/// Extracts the URLs found in the text and returns them as strings.
/// </summary>
/// <param name="text">The text to scan.</param>
/// <returns>The <c>Url</c> of every entity produced by <see cref="ExtractUrlEntities"/>.</returns>
public static IEnumerable<string> ExtractUrls(string text)
{
    // Delegate the actual matching to the entity extractor and project each entity to its URL.
    var entities = ExtractUrlEntities(text);

    return from entity in entities
           select entity.Url;
}
43 /// Extracts URLs from the text and returns them as entities.
/// Each candidate is a match of Twitter.rgUrl (case-insensitive) against the text;
/// entities are produced one at a time with yield return.
/// NOTE(review): in this excerpt the braces and the bodies of the conditionals below
/// are not visible, so the comments describe only the statements that are shown.
45 public static IEnumerable<TwitterEntityUrl> ExtractUrlEntities(string text)
// Scan the whole text with the URL pattern, ignoring case.
47 var urlMatches = Regex.Matches(text, Twitter.rgUrl, RegexOptions.IgnoreCase).Cast<Match>();
48 foreach (var m in urlMatches)
// Read the named capture groups of the current match.
50 var before = m.Groups["before"].Value;
51 var url = m.Groups["url"].Value;
52 var protocol = m.Groups["protocol"].Value;
53 var domain = m.Groups["domain"].Value;
54 var path = m.Groups["path"].Value;
// The "protocol" group captured nothing: extra checks are applied to this match.
57 if (protocol.Length == 0)
// Test the "before" capture against the pattern of characters that may not precede
// a protocol-less URL. NOTE(review): the guarded statement is not visible here.
59 if (Regex.IsMatch(before, Twitter.url_invalid_without_protocol_preceding_chars))
62 string lasturl = null;
64 var last_url_invalid_match = false;
// Enumerate every ASCII-domain match inside the captured domain, ignoring case.
65 var domainMatches = Regex.Matches(domain, Twitter.url_valid_ascii_domain, RegexOptions.IgnoreCase).Cast<Match>();
66 foreach (var mm in domainMatches)
// Record whether lasturl matches the short-domain pattern; the flag is overwritten on
// every iteration, so after the loop it reflects the last domain match examined.
// NOTE(review): lasturl is presumably assigned from mm on a line not shown - confirm.
69 last_url_invalid_match = Regex.IsMatch(lasturl, Twitter.url_invalid_short_domain, RegexOptions.IgnoreCase);
70 if (!last_url_invalid_match)
// Case: the last examined domain matched the short-domain pattern and a path was captured.
76 if (last_url_invalid_match && path.Length != 0)
// Offsets of the "url" group within the text: its start index and start index + length.
88 var startPos = m.Groups["url"].Index;
89 var endPos = startPos + m.Groups["url"].Length;
91 yield return new TwitterEntityUrl
93 Indices = new[] { startPos, endPos },
/// <summary>
/// Extracts mentions from the text and returns them as entities.
/// </summary>
/// <remarks>
/// List mentions (<c>@user/list-slug</c>) are yielded first, followed by plain user
/// mentions (<c>@user</c>). The sequence is evaluated lazily.
/// </remarks>
/// <param name="text">The text to scan.</param>
/// <returns>
/// One entity per mention. <c>Indices</c> holds the offset of the at sign and the offset
/// just past the end of the mention; <c>ScreenName</c> is the mention without the at sign.
/// </returns>
public static IEnumerable<TwitterEntityMention> ExtractMentionEntities(string text)
{
    // Both the ASCII "@" and the fullwidth at sign (U+FF20) introduce a mention.
    // The fullwidth form is written as \uFF20 so it survives any source re-encoding.
    const string listPattern =
        @"(^|[^a-zA-Z0-9_/])([@\uFF20][a-zA-Z0-9_]{1,20}/[a-zA-Z][a-zA-Z0-9\p{IsLatin-1Supplement}\-]{0,79})";

    // The trailing delimiter is a lookahead rather than a consuming group: consuming it
    // would swallow the character that has to serve as the leading delimiter of the next
    // mention, so the second mention in "@a @b" would never be matched.
    const string userPattern =
        @"(^|[^a-zA-Z0-9_/])([@\uFF20][a-zA-Z0-9_]{1,20})(?=[^a-zA-Z0-9_/]|$)";

    var listMentions = Regex.Matches(text, listPattern).Cast<Match>();
    var userMentions = Regex.Matches(text, userPattern).Cast<Match>();

    // List mentions first, then plain user mentions.
    foreach (var match in listMentions.Concat(userMentions))
    {
        // Group 1 is the leading delimiter; group 2 is the mention itself.
        var groupMention = match.Groups[2];
        var startPos = groupMention.Index;
        var endPos = startPos + groupMention.Length;

        yield return new TwitterEntityMention
        {
            Indices = new[] { startPos, endPos },
            ScreenName = groupMention.Value.Substring(1), // drop the leading at sign
        };
    }
}
139 /// Extracts hashtags from the text and returns them as entities.
/// Each match of Twitter.HASHTAG against the text produces one entity via yield return.
/// NOTE(review): the end of this method is not visible in this excerpt.
141 public static IEnumerable<TwitterEntityHashtag> ExtractHashtagEntities(string text)
// Find every occurrence of the hashtag pattern in the text.
143 var matches = Regex.Matches(text, Twitter.HASHTAG);
144 foreach (var match in matches.Cast<Match>())
// Capture groups 2 and 3 of the pattern; judging by the local names they hold the hash
// sign and the tag text respectively (the pattern itself is defined elsewhere - confirm).
146 var groupHashtagSharp = match.Groups[2];
147 var groupHashtagText = match.Groups[3];
// The entity starts at group 2 and spans the combined length of groups 2 and 3.
148 var startPos = groupHashtagSharp.Index;
149 var endPos = startPos + groupHashtagSharp.Length + groupHashtagText.Length;
151 yield return new TwitterEntityHashtag
153 Indices = new[] { startPos, endPos },
154 Text = groupHashtagText.Value,