3 * @author Mike Cochrane <mikec@mikenz.geek.nz>
4 * @author Nick Pope <nick@nickpope.me.uk>
5 * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
6 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
10 require_once 'Regex.php';
13 * Twitter Extractor Class
15 * Parses tweets and extracts URLs, usernames, username/list pairs and
18 * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
19 * is based on code by {@link http://github.com/mzsanford Matt Sanford} and
20 * heavily modified by {@link http://github.com/ngnpope Nick Pope}.
22 * @author Mike Cochrane <mikec@mikenz.geek.nz>
23 * @author Nick Pope <nick@nickpope.me.uk>
24 * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
25 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
28 class Twitter_Extractor extends Twitter_Regex {
/**
 * Static factory that builds an extractor for a tweet, so that further calls
 * can be chained directly onto the returned object.
 *
 * @param  string  $tweet  The tweet to extract elements from.
 *
 * @return  Twitter_Extractor  A newly constructed extractor.
 */
public static function create($tweet) {
    $extractor = new self($tweet);
    return $extractor;
}
/**
 * Sets up the extractor for a single tweet.
 *
 * All of the work is delegated to the parent constructor, which receives the
 * tweet unchanged.
 *
 * @param  string  $tweet  The tweet to extract.
 */
public function __construct($tweet) {
    // Nothing extractor-specific to initialise; the parent handles the tweet.
    parent::__construct($tweet);
}
/**
 * Runs every extraction method on the tweet and gathers the results into one
 * associative array.
 *
 * The keys are, in order: 'hashtags', 'urls', 'mentions', 'replyto',
 * 'hashtags_with_indices', 'urls_with_indices' and 'mentions_with_indices'.
 *
 * @return  array  The elements in the tweet, keyed by element type.
 */
public function extract() {
    $elements = array();

    # Plain values.
    $elements['hashtags'] = $this->extractHashtags();
    $elements['urls']     = $this->extractURLs();
    $elements['mentions'] = $this->extractMentionedUsernames();
    $elements['replyto']  = $this->extractRepliedUsernames();

    # The same elements together with their positions in the tweet.
    $elements['hashtags_with_indices'] = $this->extractHashtagsWithIndices();
    $elements['urls_with_indices']     = $this->extractURLsWithIndices();
    $elements['mentions_with_indices'] = $this->extractMentionedUsernamesWithIndices();

    return $elements;
}
73 * Extracts all the hashtags from the tweet.
75 * @return array The hashtag elements in the tweet.
77 public function extractHashtags() {
78 preg_match_all(self::REGEX_HASHTAG, $this->tweet, $matches);
/**
 * Extracts all the URLs from the tweet.
 *
 * Every match of the URL pattern whose protocol group contains "http://" or
 * "https://" is returned as matched.  A match without such a protocol is
 * rewritten to "http://" followed by the "www." prefix (when present) and the
 * domain, provided the domain looks like it ends in a probable TLD or the
 * match started with "www."; any other protocol-less match is discarded.
 *
 * @return  array  The URL elements in the tweet, numbered from zero.
 */
public function extractURLs() {
    preg_match_all(self::$REGEX_VALID_URL, $this->tweet, $matches);

    # Capture groups: 2 = whole URL, 3 = protocol (or 'www.'), 4 = domain.
    $groups   = array_pad($matches, 7, '');
    $url      = $groups[2];
    $protocol = $groups[3];
    $domain   = $groups[4];

    # Walk the matches from last to first.
    $index = count($url) - 1;
    while ($index >= 0) {
        $prefix = $protocol[$index];
        if (!preg_match('!https?://!', $prefix)) {
            # Note: the protocol group can contain 'www.' if no protocol exists!
            $startsWithWww = (strtolower($prefix) === 'www.');
            if (preg_match(self::REGEX_PROBABLE_TLD, $domain[$index]) || $startsWithWww) {
                $url[$index] = 'http://'.($startsWithWww ? $prefix : '').$domain[$index];
            } else {
                # Neither a protocol, a 'www.' prefix nor a probable TLD: drop it.
                unset($url[$index]);
            }
        }
        $index--;
    }

    # Dropping entries leaves gaps in the keys, so renumber before returning.
    return array_values($url);
}
/**
 * Extracts all the usernames mentioned in the tweet.
 *
 * A mention is an occurrence of a username anywhere in a tweet.  A match is
 * left out when the pattern's trailing capture group is non-empty, which
 * signals an invalid character after the username.
 *
 * @return  array  The usernames mentioned in the tweet.
 */
public function extractMentionedUsernames() {
    preg_match_all(self::REGEX_USERNAME_MENTION, $this->tweet, $matches);

    # Capture groups: 2 = username, 3 = whatever follows the username.
    $groups   = array_pad($matches, 4, '');
    $username = $groups[2];
    $after    = $groups[3];

    $usernames = array();
    for ($i = 0, $total = count($username); $i < $total; $i++) {
        # Keep the match only when nothing invalid follows it.
        if (empty($after[$i])) {
            $usernames[] = $username[$i];
        }
    }
    return $usernames;
}
/**
 * Extracts the username that the tweet is a reply to.
 *
 * A reply is an occurrence of a username at the beginning of a tweet.  Only a
 * single match is attempted, and the second capture group of the reply
 * pattern is returned.
 *
 * @return  string  The username replied to, or an empty string when the
 *                  pattern yields no second capture group.
 */
public function extractRepliedUsernames() {
    $found = array();
    preg_match(self::$REGEX_REPLY_USERNAME, $this->tweet, $found);
    if (!isset($found[2])) {
        return '';
    }
    return $found[2];
}
137 * Extracts all the hashtags and the indices they occur at from the tweet.
139 * @return array The hashtag elements in the tweet.
141 public function extractHashtagsWithIndices() {
142 preg_match_all(self::REGEX_HASHTAG, $this->tweet, $matches, PREG_OFFSET_CAPTURE);
144 for ($i = 0; $i < count($m); $i++) {
145 $m[$i] = array_combine(array('hashtag', 'indices'), $m[$i]);
146 # XXX: Fix for PREG_OFFSET_CAPTURE returning byte offsets...
147 $start = mb_strlen(substr($this->tweet, 0, $matches[1][$i][1]));
148 $start += mb_strlen($matches[1][$i][0]);
149 $length = mb_strlen($m[$i]['hashtag']);
150 $m[$i]['indices'] = array($start, $start + $length + 1);
/**
 * Extracts all the URLs and the indices they occur at from the tweet.
 *
 * Each returned entry is an array with two keys: 'url' (the matched text) and
 * 'indices' (a two-element array holding the start and end position of the
 * URL).  Positions are measured with mb_strlen() rather than in bytes.
 *
 * @return  array  The URL elements in the tweet.
 */
public function extractURLsWithIndices() {
    preg_match_all(self::$REGEX_VALID_URL, $this->tweet, $matches, PREG_OFFSET_CAPTURE);

    # Group 2 holds the URLs; with PREG_OFFSET_CAPTURE every capture is a
    # (text, byte offset) pair.
    $urls  = &$matches[2];
    $total = count($urls);
    for ($i = 0; $i < $total; $i++) {
        # XXX: PREG_OFFSET_CAPTURE reports byte offsets, so count the characters
        # that precede the URL instead: everything before capture group 1,
        # plus capture group 1 itself.
        $before = $matches[1][$i];
        $start  = mb_strlen(substr($this->tweet, 0, $before[1])) + mb_strlen($before[0]);

        $text = $urls[$i][0];
        $urls[$i] = array(
            'url'     => $text,
            'indices' => array($start, $start + mb_strlen($text)),
        );
    }
    return $urls;
}
/**
 * Extracts all the usernames and the indices they occur at from the tweet.
 *
 * Each returned entry is an array with two keys: 'screen_name' (the captured
 * username) and 'indices' (a two-element array holding the start and end
 * position of the mention).  Positions are measured with mb_strlen() rather
 * than in bytes.
 *
 * @return  array  The username elements in the tweet.
 */
public function extractMentionedUsernamesWithIndices() {
    preg_match_all(self::REGEX_USERNAME_MENTION, $this->tweet, $matches, PREG_OFFSET_CAPTURE);

    # Group 2 holds the usernames; with PREG_OFFSET_CAPTURE every capture is a
    # (text, byte offset) pair.
    $names = &$matches[2];
    $total = count($names);
    for ($i = 0; $i < $total; $i++) {
        # XXX: PREG_OFFSET_CAPTURE reports byte offsets, so count the characters
        # that precede the mention instead: everything before capture group 1,
        # plus capture group 1 itself.
        $before = $matches[1][$i];
        $start  = mb_strlen(substr($this->tweet, 0, $before[1])) + mb_strlen($before[0]);

        $name = $names[$i][0];
        # The end is one character past the name's length; presumably this
        # accounts for the leading '@', which is not part of the captured
        # name -- confirm against REGEX_USERNAME_MENTION.
        $names[$i] = array(
            'screen_name' => $name,
            'indices'     => array($start, $start + mb_strlen($name) + 1),
        );
    }
    return $names;
}