OSDN Git Service

Merge branch 'RetweetDecode'
[opentween/open-tween.git] / OpenTween / TweetExtractor.cs
1 // OpenTween - Client of Twitter
2 // Copyright (c) 2015 kim_upsilon (@kim_upsilon) <https://upsilo.net/~upsilon/>
3 // All rights reserved.
4 //
5 // This file is part of OpenTween.
6 //
7 // This program is free software; you can redistribute it and/or modify it
8 // under the terms of the GNU General Public License as published by the Free
9 // Software Foundation; either version 3 of the License, or (at your option)
10 // any later version.
11 //
12 // This program is distributed in the hope that it will be useful, but
13 // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 // for more details.
16 //
17 // You should have received a copy of the GNU General Public License along
18 // with this program. If not, see <http://www.gnu.org/licenses/>, or write to
19 // the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
20 // Boston, MA 02110-1301, USA.
21
22 using System;
23 using System.Collections.Generic;
24 using System.Linq;
25 using System.Text;
26 using System.Text.RegularExpressions;
27 using System.Threading.Tasks;
28 using OpenTween.Api;
29
namespace OpenTween
{
    /// <summary>
    /// Helpers that scan plain tweet text with regular expressions and report the
    /// URLs, mentions and hashtags found in it as entity objects.
    /// </summary>
    /// <remarks>
    /// Every entity carries <c>Indices = { start, end }</c>, where <c>start</c> is the
    /// string index of the first character of the entity inside the scanned text and
    /// <c>end</c> is <c>start</c> plus the entity length (i.e. an exclusive end index).
    /// All entity-returning methods are iterators: the text is scanned lazily while the
    /// result is enumerated, so an exception caused by the input (for example a null
    /// <c>text</c> being rejected by <see cref="Regex"/>) surfaces during enumeration,
    /// not at the call itself.
    /// </remarks>
    public static class TweetExtractor
    {
        /// <summary>
        /// List mention such as <c>@user/list-name</c>.
        /// Group 1 is the delimiter in front of the mention (or the start of the text),
        /// group 2 is the mention itself including the at-sign.
        /// The at-sign may be the ASCII '@' or the full-width U+FF20.
        /// </summary>
        private static readonly Regex ListMentionRegex = new Regex(
            @"(^|[^a-zA-Z0-9_/])([@\uFF20][a-zA-Z0-9_]{1,20}/[a-zA-Z][a-zA-Z0-9\p{IsLatin-1Supplement}\-]{0,79})");

        /// <summary>
        /// Plain user mention such as <c>@user</c>.
        /// Group 1 is the delimiter in front of the mention (or the start of the text),
        /// group 2 is the mention itself including the at-sign.
        /// </summary>
        /// <remarks>
        /// The trailing delimiter is checked with a zero-width lookahead so that it is
        /// NOT consumed by the match. If it were consumed, the single character between
        /// two adjacent mentions ("@a @b") would belong to the first match and could no
        /// longer serve as the leading delimiter of the second one, which would then be
        /// silently skipped.
        /// </remarks>
        private static readonly Regex UserMentionRegex = new Regex(
            @"(^|[^a-zA-Z0-9_/])([@\uFF20][a-zA-Z0-9_]{1,20})(?=[^a-zA-Z0-9_/]|$)");

        /// <summary>
        /// Extracts the URLs contained in the text.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>
        /// The <c>Url</c> value of every entity produced by
        /// <see cref="ExtractUrlEntities"/>, in the same order.
        /// </returns>
        public static IEnumerable<string> ExtractUrls(string text)
        {
            return ExtractUrlEntities(text).Select(x => x.Url);
        }

        /// <summary>
        /// Extracts the URLs contained in the text and returns them as entities.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>
        /// One <see cref="TwitterEntityUrl"/> per accepted match of <c>Twitter.rgUrl</c>.
        /// <c>Url</c>, <c>ExpandedUrl</c> and <c>DisplayUrl</c> are all set to the matched
        /// URL text; nothing is expanded or shortened here.
        /// </returns>
        public static IEnumerable<TwitterEntityUrl> ExtractUrlEntities(string text)
        {
            foreach (Match m in Regex.Matches(text, Twitter.rgUrl, RegexOptions.IgnoreCase))
            {
                // A match that carries an explicit protocol is always accepted;
                // one without a protocol has to pass the additional checks below.
                var hasProtocol = m.Groups["protocol"].Value.Length != 0;
                if (!hasProtocol &&
                    !IsValidUrlWithoutProtocol(m.Groups["before"].Value, m.Groups["domain"].Value, m.Groups["path"].Value))
                {
                    continue;
                }

                var groupUrl = m.Groups["url"];
                var url = groupUrl.Value;
                var startPos = groupUrl.Index;
                var endPos = startPos + groupUrl.Length;

                yield return new TwitterEntityUrl
                {
                    Indices = new[] { startPos, endPos },
                    Url = url,
                    ExpandedUrl = url,
                    DisplayUrl = url,
                };
            }
        }

        /// <summary>
        /// Decides whether a URL candidate that was written without a protocol should be
        /// treated as a URL.
        /// </summary>
        /// <param name="before">Value of the "before" group (text in front of the candidate).</param>
        /// <param name="domain">Value of the "domain" group.</param>
        /// <param name="path">Value of the "path" group; empty when the candidate has no path.</param>
        /// <returns>
        /// <c>false</c> when <paramref name="before"/> matches
        /// <c>Twitter.url_invalid_without_protocol_preceding_chars</c>. Otherwise <c>true</c>
        /// when at least one <c>Twitter.url_valid_ascii_domain</c> match inside
        /// <paramref name="domain"/> does not match <c>Twitter.url_invalid_short_domain</c>,
        /// or when the last such match does match it but a path is present.
        /// </returns>
        private static bool IsValidUrlWithoutProtocol(string before, string domain, string path)
        {
            if (Regex.IsMatch(before, Twitter.url_invalid_without_protocol_preceding_chars))
            {
                return false;
            }

            var hasAcceptableDomain = false;
            var lastDomainIsInvalidShort = false;
            foreach (Match domainMatch in Regex.Matches(domain, Twitter.url_valid_ascii_domain, RegexOptions.IgnoreCase))
            {
                lastDomainIsInvalidShort = Regex.IsMatch(domainMatch.Value, Twitter.url_invalid_short_domain, RegexOptions.IgnoreCase);
                if (!lastDomainIsInvalidShort)
                {
                    hasAcceptableDomain = true;
                }
            }

            // A trailing "invalid short" domain is still accepted when it is followed by a path.
            return hasAcceptableDomain || (lastDomainIsInvalidShort && path.Length != 0);
        }

        /// <summary>
        /// Extracts the mentions contained in the text and returns them as entities.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>
        /// All list mentions (<c>@user/list</c>) in order of appearance, followed by all
        /// plain user mentions (<c>@user</c>) in order of appearance. <c>ScreenName</c>
        /// holds the matched text including the leading at-sign.
        /// </returns>
        public static IEnumerable<TwitterEntityMention> ExtractMentionEntities(string text)
        {
            // List mentions
            foreach (Match match in ListMentionRegex.Matches(text))
            {
                yield return CreateMentionEntity(match.Groups[2]);
            }

            // Plain user mentions
            foreach (Match match in UserMentionRegex.Matches(text))
            {
                yield return CreateMentionEntity(match.Groups[2]);
            }
        }

        /// <summary>
        /// Builds a mention entity from the capture group that holds the mention text.
        /// </summary>
        private static TwitterEntityMention CreateMentionEntity(Group groupMention)
        {
            var startPos = groupMention.Index;
            var endPos = startPos + groupMention.Length;

            return new TwitterEntityMention
            {
                Indices = new[] { startPos, endPos },
                ScreenName = groupMention.Value,
            };
        }

        /// <summary>
        /// Extracts the hashtags contained in the text and returns them as entities.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>
        /// One <see cref="TwitterEntityHashtag"/> per match of <c>Twitter.HASHTAG</c>.
        /// <c>Text</c> is the concatenation of capture groups 2 and 3 of that pattern
        /// (presumably the hash sign and the tag body; confirm against the pattern
        /// definition), and the indices span exactly that concatenation.
        /// </returns>
        public static IEnumerable<TwitterEntityHashtag> ExtractHashtagEntities(string text)
        {
            foreach (Match match in Regex.Matches(text, Twitter.HASHTAG))
            {
                var groupHashtagSharp = match.Groups[2];
                var groupHashtagText = match.Groups[3];
                var hashtag = groupHashtagSharp.Value + groupHashtagText.Value;

                // The span starts at group 2 and covers the combined length of groups 2 and 3.
                var startPos = groupHashtagSharp.Index;
                var endPos = startPos + groupHashtagSharp.Length + groupHashtagText.Length;

                yield return new TwitterEntityHashtag
                {
                    Indices = new[] { startPos, endPos },
                    Text = hashtag,
                };
            }
        }
    }
}