1 // OpenTween - Client of Twitter
2 // Copyright (c) 2015 kim_upsilon (@kim_upsilon) <https://upsilo.net/~upsilon/>
3 // All rights reserved.
5 // This file is part of OpenTween.
7 // This program is free software; you can redistribute it and/or modify it
8 // under the terms of the GNU General Public License as published by the Free
9 // Software Foundation; either version 3 of the License, or (at your option)
12 // This program is distributed in the hope that it will be useful, but
13 // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 // You should have received a copy of the GNU General Public License along
18 // with this program. If not, see <http://www.gnu.org/licenses/>, or write to
19 // the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
20 // Boston, MA 02110-1301, USA.
23 using System.Collections.Generic;
26 using System.Text.RegularExpressions;
27 using System.Threading.Tasks;
32 public static class TweetExtractor
/// <summary>
/// Extracts the URLs found in the given text and returns them as strings.
/// </summary>
/// <param name="text">The text to scan for URLs.</param>
/// <returns>
/// The <c>Url</c> value of every entity yielded by
/// <see cref="ExtractUrlEntities(string)"/> for <paramref name="text"/>.
/// </returns>
public static IEnumerable<string> ExtractUrls(string text)
{
    // Delegate the actual matching to the entity extractor and keep only the URL string of each hit.
    var entities = ExtractUrlEntities(text);

    return from entity in entities
           select entity.Url;
}
43 /// テキストから URL を抽出してエンティティとして返します
45 public static IEnumerable<TwitterEntityUrl> ExtractUrlEntities(string text)
47 var urlMatches = Regex.Matches(text, Twitter.rgUrl, RegexOptions.IgnoreCase).Cast<Match>();
48 foreach (var m in urlMatches)
50 var before = m.Groups["before"].Value;
51 var url = m.Groups["url"].Value;
52 var protocol = m.Groups["protocol"].Value;
53 var domain = m.Groups["domain"].Value;
54 var path = m.Groups["path"].Value;
57 if (protocol.Length == 0)
59 if (Regex.IsMatch(before, Twitter.url_invalid_without_protocol_preceding_chars))
62 string lasturl = null;
64 var last_url_invalid_match = false;
65 var domainMatches = Regex.Matches(domain, Twitter.url_valid_ascii_domain, RegexOptions.IgnoreCase).Cast<Match>();
66 foreach (var mm in domainMatches)
69 last_url_invalid_match = Regex.IsMatch(lasturl, Twitter.url_invalid_short_domain, RegexOptions.IgnoreCase);
70 if (!last_url_invalid_match)
76 if (last_url_invalid_match && path.Length != 0)
88 var startPos = m.Groups["url"].Index;
89 var endPos = startPos + m.Groups["url"].Length;
91 yield return new TwitterEntityUrl
93 Indices = new[] { startPos, endPos },