OSDN Git Service

C# 8.0 のnull許容参照型を有効化
[opentween/open-tween.git] / OpenTween / TweetExtractor.cs
1 // OpenTween - Client of Twitter
2 // Copyright (c) 2015 kim_upsilon (@kim_upsilon) <https://upsilo.net/~upsilon/>
3 // Copyright (c) 2015 takke (@takke) <http://takke.jp/>
4 // All rights reserved.
5 //
6 // This file is part of OpenTween.
7 //
8 // This program is free software; you can redistribute it and/or modify it
9 // under the terms of the GNU General Public License as published by the Free
10 // Software Foundation; either version 3 of the License, or (at your option)
11 // any later version.
12 //
13 // This program is distributed in the hope that it will be useful, but
14 // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 // for more details.
17 //
18 // You should have received a copy of the GNU General Public License along
19 // with this program. If not, see <http://www.gnu.org/licenses/>, or write to
20 // the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
21 // Boston, MA 02110-1301, USA.
22
23 #nullable enable
24
25 using System;
26 using System.Collections.Generic;
27 using System.Linq;
28 using System.Text;
29 using System.Text.RegularExpressions;
30 using System.Threading.Tasks;
31 using OpenTween.Api.DataModel;
32
33 namespace OpenTween
34 {
35     public static class TweetExtractor
36     {
37         public static readonly Regex EmojiPattern = new Regex(@"/(?:\ud83d\udc68\ud83c\udffc\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c\udffb|\ud83d\udc68\ud83c\udffd\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c[\udffb\udffc]|\ud83d\udc68\ud83c\udffe\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c[\udffb-\udffd]|\ud83d\udc68\ud83c\udfff\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c[\udffb-\udffe]|\ud83d\udc69\ud83c\udffb\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c[\udffc-\udfff]|\ud83d\udc69\ud83c\udffc\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c[\udffb\udffd-\udfff]|\ud83d\udc69\ud83c\udffc\u200d\ud83e\udd1d\u200d\ud83d\udc69\ud83c\udffb|\ud83d\udc69\ud83c\udffd\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c[\udffb\udffc\udffe\udfff]|\ud83d\udc69\ud83c\udffd\u200d\ud83e\udd1d\u200d\ud83d\udc69\ud83c[\udffb\udffc]|\ud83d\udc69\ud83c\udffe\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c[\udffb-\udffd\udfff]|\ud83d\udc69\ud83c\udffe\u200d\ud83e\udd1d\u200d\ud83d\udc69\ud83c[\udffb-\udffd]|\ud83d\udc69\ud83c\udfff\u200d\ud83e\udd1d\u200d\ud83d\udc68\ud83c[\udffb-\udffe]|\ud83d\udc69\ud83c\udfff\u200d\ud83e\udd1d\u200d\ud83d\udc69\ud83c[\udffb-\udffe]|\ud83e\uddd1\ud83c\udffb\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c\udffb|\ud83e\uddd1\ud83c\udffc\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c[\udffb\udffc]|\ud83e\uddd1\ud83c\udffd\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c[\udffb-\udffd]|\ud83e\uddd1\ud83c\udffe\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c[\udffb-\udffe]|\ud83e\uddd1\ud83c\udfff\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c[\udffb-\udfff]|\ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1|\ud83d\udc6b\ud83c[\udffb-\udfff]|\ud83d\udc6c\ud83c[\udffb-\udfff]|\ud83d\udc6d\ud83c[\udffb-\udfff]|\ud83d[\udc6b-\udc6d])|(?:\ud83d[\udc68\udc69])(?:\ud83c[\udffb-\udfff])?\u200d(?:\u2695\ufe0f|\u2696\ufe0f|\u2708\ufe0f|\ud83c[\udf3e\udf73\udf93\udfa4\udfa8\udfeb\udfed]|\ud83d[\udcbb\udcbc\udd27\udd2c\ude80\ude92]|\ud83e[\uddaf-\uddb3\uddbc\uddbd])|(?:\ud83c[\udfcb\udfcc]|\ud83d[\udd74\udd75]|\u26f9)((?:\ud83c[\udffb-\udfff]|\ufe0f)\u200d[\u2640\u2642]\ufe0f)|(?:\ud83c[\udfc3\udfc4\udfca]|\ud83d[\udc6e\udc71\udc73\udc77\udc81\udc82\udc86\udc87\ude45-\ude47\ude4b\ude4d\ude4e\udea3\udeb4-\udeb6]|\ud83e[\udd26\udd35\udd37-\udd39\udd3d\udd3e\uddb8\uddb9\uddcd-\uddcf\uddd6-\udddd])(?:\ud83c[\udffb-\udfff])?\u200d[\u2640\u2642]\ufe0f|(?:\ud83d\udc68\u200d\u2764\ufe0f\u200d\ud83d\udc8b\u200d\ud83d\udc68|\ud83d\udc68\u200d\ud83d\udc68\u200d\ud83d\udc66\u200d\ud83d\udc66|\ud83d\udc68\u200d\ud83d\udc68\u200d\ud83d\udc67\u200d\ud83d[\udc66\udc67]|\ud83d\udc68\u200d\ud83d\udc69\u200d\ud83d\udc66\u200d\ud83d\udc66|\ud83d\udc68\u200d\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d[\udc66\udc67]|\ud83d\udc69\u200d\u2764\ufe0f\u200d\ud83d\udc8b\u200d\ud83d[\udc68\udc69]|\ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d\udc66\u200d\ud83d\udc66|\ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d[\udc66\udc67]|\ud83d\udc68\u200d\u2764\ufe0f\u200d\ud83d\udc68|\ud83d\udc68\u200d\ud83d\udc66\u200d\ud83d\udc66|\ud83d\udc68\u200d\ud83d\udc67\u200d\ud83d[\udc66\udc67]|\ud83d\udc68\u200d\ud83d\udc68\u200d\ud83d[\udc66\udc67]|\ud83d\udc68\u200d\ud83d\udc69\u200d\ud83d[\udc66\udc67]|\ud83d\udc69\u200d\u2764\ufe0f\u200d\ud83d[\udc68\udc69]|\ud83d\udc69\u200d\ud83d\udc66\u200d\ud83d\udc66|\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d[\udc66\udc67]|\ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d[\udc66\udc67]|\ud83c\udff3\ufe0f\u200d\ud83c\udf08|\ud83c\udff4\u200d\u2620\ufe0f|\ud83d\udc15\u200d\ud83e\uddba|\ud83d\udc41\u200d\ud83d\udde8|\ud83d\udc68\u200d\ud83d[\udc66\udc67]|\ud83d\udc69\u200d\ud83d[\udc66\udc67]|\ud83d\udc6f\u200d\u2640\ufe0f|\ud83d\udc6f\u200d\u2642\ufe0f|\ud83e\udd3c\u200d\u2640\ufe0f|\ud83e\udd3c\u200d\u2642\ufe0f|\ud83e\uddde\u200d\u2640\ufe0f|\ud83e\uddde\u200d\u2642\ufe0f|\ud83e\udddf\u200d\u2640\ufe0f|\ud83e\udddf\u200d\u2642\ufe0f)|[#*0-9]\ufe0f?\u20e3|(?:[©®\u2122\u265f]\ufe0f)|(?:\ud83c[\udc04\udd70\udd71\udd7e\udd7f\ude02\ude1a\ude2f\ude37\udf21\udf24-\udf2c\udf36\udf7d\udf96\udf97\udf99-\udf9b\udf9e\udf9f\udfcd\udfce\udfd4-\udfdf\udff3\udff5\udff7]|\ud83d[\udc3f\udc41\udcfd\udd49\udd4a\udd6f\udd70\udd73\udd76-\udd79\udd87\udd8a-\udd8d\udda5\udda8\uddb1\uddb2\uddbc\uddc2-\uddc4\uddd1-\uddd3\udddc-\uddde\udde1\udde3\udde8\uddef\uddf3\uddfa\udecb\udecd-\udecf\udee0-\udee5\udee9\udef0\udef3]|[\u203c\u2049\u2139\u2194-\u2199\u21a9\u21aa\u231a\u231b\u2328\u23cf\u23ed-\u23ef\u23f1\u23f2\u23f8-\u23fa\u24c2\u25aa\u25ab\u25b6\u25c0\u25fb-\u25fe\u2600-\u2604\u260e\u2611\u2614\u2615\u2618\u2620\u2622\u2623\u2626\u262a\u262e\u262f\u2638-\u263a\u2640\u2642\u2648-\u2653\u2660\u2663\u2665\u2666\u2668\u267b\u267f\u2692-\u2697\u2699\u269b\u269c\u26a0\u26a1\u26aa\u26ab\u26b0\u26b1\u26bd\u26be\u26c4\u26c5\u26c8\u26cf\u26d1\u26d3\u26d4\u26e9\u26ea\u26f0-\u26f5\u26f8\u26fa\u26fd\u2702\u2708\u2709\u270f\u2712\u2714\u2716\u271d\u2721\u2733\u2734\u2744\u2747\u2757\u2763\u2764\u27a1\u2934\u2935\u2b05-\u2b07\u2b1b\u2b1c\u2b50\u2b55\u3030\u303d\u3297\u3299])(?:\ufe0f|(?!\ufe0e))|(?:(?:\ud83c[\udfcb\udfcc]|\ud83d[\udd74\udd75\udd90]|[\u261d\u26f7\u26f9\u270c\u270d])(?:\ufe0f|(?!\ufe0e))|(?:\ud83c[\udf85\udfc2-\udfc4\udfc7\udfca]|\ud83d[\udc42\udc43\udc46-\udc50\udc66-\udc69\udc6e\udc70-\udc78\udc7c\udc81-\udc83\udc85-\udc87\udcaa\udd7a\udd95\udd96\ude45-\ude47\ude4b-\ude4f\udea3\udeb4-\udeb6\udec0\udecc]|\ud83e[\udd0f\udd18-\udd1c\udd1e\udd1f\udd26\udd30-\udd39\udd3d\udd3e\uddb5\uddb6\uddb8\uddb9\uddbb\uddcd-\uddcf\uddd1-\udddd]|[\u270a\u270b]))(?:\ud83c[\udffb-\udfff])?|(?:\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f|\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc73\udb40\udc63\udb40\udc74\udb40\udc7f|\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc77\udb40\udc6c\udb40\udc73\udb40\udc7f|\ud83c\udde6\ud83c[\udde8-\uddec\uddee\uddf1\uddf2\uddf4\uddf6-\uddfa\uddfc\uddfd\uddff]|\ud83c\udde7\ud83c[\udde6\udde7\udde9-\uddef\uddf1-\uddf4\uddf6-\uddf9\uddfb\uddfc\uddfe\uddff]|\ud83c\udde8\ud83c[\udde6\udde8\udde9\uddeb-\uddee\uddf0-\uddf5\uddf7\uddfa-\uddff]|\ud83c\udde9\ud83c[\uddea\uddec\uddef\uddf0\uddf2\uddf4\uddff]|\ud83c\uddea\ud83c[\udde6\udde8\uddea\uddec\udded\uddf7-\uddfa]|\ud83c\uddeb\ud83c[\uddee-\uddf0\uddf2\uddf4\uddf7]|\ud83c\uddec\ud83c[\udde6\udde7\udde9-\uddee\uddf1-\uddf3\uddf5-\uddfa\uddfc\uddfe]|\ud83c\udded\ud83c[\uddf0\uddf2\uddf3\uddf7\uddf9\uddfa]|\ud83c\uddee\ud83c[\udde8-\uddea\uddf1-\uddf4\uddf6-\uddf9]|\ud83c\uddef\ud83c[\uddea\uddf2\uddf4\uddf5]|\ud83c\uddf0\ud83c[\uddea\uddec-\uddee\uddf2\uddf3\uddf5\uddf7\uddfc\uddfe\uddff]|\ud83c\uddf1\ud83c[\udde6-\udde8\uddee\uddf0\uddf7-\uddfb\uddfe]|\ud83c\uddf2\ud83c[\udde6\udde8-\udded\uddf0-\uddff]|\ud83c\uddf3\ud83c[\udde6\udde8\uddea-\uddec\uddee\uddf1\uddf4\uddf5\uddf7\uddfa\uddff]|\ud83c\uddf4\ud83c\uddf2|\ud83c\uddf5\ud83c[\udde6\uddea-\udded\uddf0-\uddf3\uddf7-\uddf9\uddfc\uddfe]|\ud83c\uddf6\ud83c\udde6|\ud83c\uddf7\ud83c[\uddea\uddf4\uddf8\uddfa\uddfc]|\ud83c\uddf8\ud83c[\udde6-\uddea\uddec-\uddf4\uddf7-\uddf9\uddfb\uddfd-\uddff]|\ud83c\uddf9\ud83c[\udde6\udde8\udde9\uddeb-\udded\uddef-\uddf4\uddf7\uddf9\uddfb\uddfc\uddff]|\ud83c\uddfa\ud83c[\udde6\uddec\uddf2\uddf3\uddf8\uddfe\uddff]|\ud83c\uddfb\ud83c[\udde6\udde8\uddea\uddec\uddee\uddf3\uddfa]|\ud83c\uddfc\ud83c[\uddeb\uddf8]|\ud83c\uddfd\ud83c\uddf0|\ud83c\uddfe\ud83c[\uddea\uddf9]|\ud83c\uddff\ud83c[\udde6\uddf2\uddfc]|\ud83c[\udccf\udd8e\udd91-\udd9a\udde6-\uddff\ude01\ude32-\ude36\ude38-\ude3a\ude50\ude51\udf00-\udf20\udf2d-\udf35\udf37-\udf7c\udf7e-\udf84\udf86-\udf93\udfa0-\udfc1\udfc5\udfc6\udfc8\udfc9\udfcf-\udfd3\udfe0-\udff0\udff4\udff8-\udfff]|\ud83d[\udc00-\udc3e\udc40\udc44\udc45\udc51-\udc65\udc6a-\udc6d\udc6f\udc79-\udc7b\udc7d-\udc80\udc84\udc88-\udca9\udcab-\udcfc\udcff-\udd3d\udd4b-\udd4e\udd50-\udd67\udda4\uddfb-\ude44\ude48-\ude4a\ude80-\udea2\udea4-\udeb3\udeb7-\udebf\udec1-\udec5\uded0-\uded2\uded5\udeeb\udeec\udef4-\udefa\udfe0-\udfeb]|\ud83e[\udd0d\udd0e\udd10-\udd17\udd1d\udd20-\udd25\udd27-\udd2f\udd3a\udd3c\udd3f-\udd45\udd47-\udd71\udd73-\udd76\udd7a-\udda2\udda5-\uddaa\uddae-\uddb4\uddb7\uddba\uddbc-\uddca\uddd0\uddde-\uddff\ude70-\ude73\ude78-\ude7a\ude80-\ude82\ude90-\ude95]|[\u23e9-\u23ec\u23f0\u23f3\u267e\u26ce\u2705\u2728\u274c\u274e\u2753-\u2755\u2795-\u2797\u27b0\u27bf\ue50a])|\ufe0f");
38
39         /// <summary>
40         /// テキストから URL を抽出して返します
41         /// </summary>
42         public static IEnumerable<string> ExtractUrls(string text)
43             => ExtractUrlEntities(text).Select(x => x.Url);
44
45         /// <summary>
46         /// テキストから URL を抽出してエンティティとして返します
47         /// </summary>
48         public static IEnumerable<TwitterEntityUrl> ExtractUrlEntities(string text)
49         {
50             var urlMatches = Regex.Matches(text, Twitter.rgUrl, RegexOptions.IgnoreCase).Cast<Match>();
51             foreach (var m in urlMatches)
52             {
53                 var before = m.Groups["before"].Value;
54                 var url = m.Groups["url"].Value;
55                 var protocol = m.Groups["protocol"].Value;
56                 var domain = m.Groups["domain"].Value;
57                 var path = m.Groups["path"].Value;
58
59                 var validUrl = false;
60                 if (protocol.Length == 0)
61                 {
62                     if (Regex.IsMatch(before, Twitter.url_invalid_without_protocol_preceding_chars))
63                         continue;
64
65                     string? lasturl = null;
66
67                     var last_url_invalid_match = false;
68                     var domainMatches = Regex.Matches(domain, Twitter.url_valid_ascii_domain, RegexOptions.IgnoreCase).Cast<Match>();
69                     foreach (var mm in domainMatches)
70                     {
71                         lasturl = mm.Value;
72                         last_url_invalid_match = Regex.IsMatch(lasturl, Twitter.url_invalid_short_domain, RegexOptions.IgnoreCase);
73                         if (!last_url_invalid_match)
74                         {
75                             validUrl = true;
76                         }
77                     }
78
79                     if (last_url_invalid_match && path.Length != 0)
80                     {
81                         validUrl = true;
82                     }
83                 }
84                 else
85                 {
86                     validUrl = true;
87                 }
88
89                 if (validUrl)
90                 {
91                     var urlGroup = m.Groups["url"];
92                     var startPos = text.GetCodepointCount(0, urlGroup.Index);
93                     var endPos = startPos + text.GetCodepointCount(urlGroup.Index, urlGroup.Index + urlGroup.Length);
94
95                     yield return new TwitterEntityUrl
96                     {
97                         Indices = new[] { startPos, endPos },
98                         Url = url,
99                         ExpandedUrl = url,
100                         DisplayUrl = url,
101                     };
102                 }
103             }
104         }
105
106         /// <summary>
107         /// テキストからメンションを抽出してエンティティとして返します
108         /// </summary>
109         public static IEnumerable<TwitterEntityMention> ExtractMentionEntities(string text)
110         {
111             // リスト
112             var matchesAtList = Regex.Matches(text, @"(?<=^|[^a-zA-Z0-9_/])([@@][a-zA-Z0-9_]{1,20}/[a-zA-Z][a-zA-Z0-9\p{IsLatin-1Supplement}\-]{0,79})");
113             foreach (var match in matchesAtList.Cast<Match>())
114             {
115                 var startPos = text.GetCodepointCount(0, match.Index);
116                 var endPos = startPos + text.GetCodepointCount(match.Index, match.Index + match.Length);
117
118                 yield return new TwitterEntityMention
119                 {
120                     Indices = new[] { startPos, endPos },
121                     ScreenName = match.Value.Substring(1), // 先頭の「@」は取り除く
122                 };
123             }
124
125             // 通常のメンション
126             var matchesAtUser = Regex.Matches(text, "(?<=^|[^a-zA-Z0-9_/])([@@][a-zA-Z0-9_]{1,20})(?=[^a-zA-Z0-9_/]|$)");
127             foreach (var match in matchesAtUser.Cast<Match>())
128             {
129                 var startPos = text.GetCodepointCount(0, match.Index);
130                 var endPos = startPos + text.GetCodepointCount(match.Index, match.Index + match.Length);
131
132                 yield return new TwitterEntityMention
133                 {
134                     Indices = new[] { startPos, endPos },
135                     ScreenName = match.Value.Substring(1), // 先頭の「@」は取り除く
136                 };
137             }
138         }
139
140         /// <summary>
141         /// テキストからハッシュタグを抽出してエンティティとして返します
142         /// </summary>
143         public static IEnumerable<TwitterEntityHashtag> ExtractHashtagEntities(string text)
144         {
145             var matches = Regex.Matches(text, Twitter.HASHTAG);
146             foreach (var match in matches.Cast<Match>())
147             {
148                 var groupHashtagSharp = match.Groups[2];
149                 var groupHashtagText = match.Groups[3];
150                 var startPos = text.GetCodepointCount(0, groupHashtagSharp.Index);
151                 var endPos = startPos + text.GetCodepointCount(groupHashtagSharp.Index, groupHashtagSharp.Index + groupHashtagSharp.Length + groupHashtagText.Length);
152
153                 yield return new TwitterEntityHashtag
154                 {
155                     Indices = new[] { startPos, endPos },
156                     Text = groupHashtagText.Value,
157                 };
158             }
159         }
160
161         /// <summary>
162         /// テキストから絵文字を抽出して <see cref="TwitterEntityEmoji"/> として返します
163         /// </summary>
164         public static IEnumerable<TwitterEntityEmoji> ExtractEmojiEntities(string text)
165         {
166             var matches = TweetExtractor.EmojiPattern.Matches(text);
167
168             foreach (var match in matches.Cast<Match>())
169             {
170                 var input = match.Value;
171
172                 // 異字体セレクタ U+FE0F (emoji style) は除去する (ZWJ を含まない場合のみ)
173                 if (!input.Contains('\u200D'))
174                     input = input.Replace("\uFE0F", "");
175
176                 var codepointHex = new List<string>();
177
178                 for (var i = 0; i < input.Length; i += char.IsSurrogatePair(input, i) ? 2 : 1)
179                 {
180                     var codepoint = char.ConvertToUtf32(input, i);
181                     codepointHex.Add($"{codepoint:x}");
182                 }
183
184                 var startPos = text.GetCodepointCount(0, match.Index);
185                 var endPos = startPos + text.GetCodepointCount(match.Index, match.Index + match.Length);
186
187                 TwitterEntityEmoji entity;
188                 if (codepointHex.Count >= 1)
189                 {
190                     entity = new TwitterEntityEmoji
191                     {
192                         Indices = new[] { startPos, endPos },
193                         Text = input,
194                         Url = "https://twemoji.maxcdn.com/2/72x72/" + string.Join("-", codepointHex) + ".png",
195                     };
196                 }
197                 else
198                 {
199                     entity = new TwitterEntityEmoji
200                     {
201                         Indices = new[] { startPos, endPos },
202                         Text = input,
203                         Url = "",
204                     };
205                 }
206
207                 yield return entity;
208             }
209         }
210     }
211 }