# twitter2rss.py
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Author:: drymer <drymer [ EN ] autistici.org>
  5. # Copyright:: Copyright (c) 2016, drymer
  6. #
  7. # This program is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, either version 2 of the License, or (at
  10. # your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful, but
  13. # WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. # General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. #
  20. from html.parser import HTMLParser
  21. from sys import argv
  22. from threading import Thread
  23. from queue import Queue
  24. import datetime
  25. import PyRSS2Gen
  26. import hashlib
  27. import requests
  28. import sys
  29. import os
  30. class twitterParser(HTMLParser):
  31. """HTMLParser class object."""
  32. def __init__(self):
  33. """
  34. Inicialize __init__ class with some variables used later
  35. declared variables:
  36. self.recording -- int that controls if there's data recording
  37. self.data -- list containing the parsed HTML
  38. self.attributes -- list containing HTML tag's attributes
  39. self.tempData -- temporal list containing parsed HTML
  40. self.id -- string containin tag information for differentiating
  41. """
  42. # Support por python3.2, but it may be incosistent
  43. try:
  44. HTMLParser.__init__(self, convert_charrefs=True)
  45. except:
  46. HTMLParser.__init__(self)
  47. self.recording = 0
  48. self.data = []
  49. self.attributes = []
  50. self.tempData = []
  51. self.id = ''
  52. def handle_starttag(self, tag, attrs):
  53. """
  54. Identify when the tags of interest begins and start recording the data.
  55. return -- just a way for breake the loop
  56. """
  57. self.tag = tag
  58. if self.tag not in ['p', 'span', 'img']:
  59. return
  60. elif self.recording:
  61. self.recording += 1
  62. return
  63. # Key to find the tweets and identify if they are retweets.
  64. # It is likely to change over time, as Twitter do the same
  65. for name, value in attrs:
  66. if self.tag == 'p' and name == 'class' \
  67. and 'TweetTextSize TweetTextSize' in value:
  68. self.recording += 1
  69. self.attributes += attrs
  70. self.id = 'p'
  71. break
  72. elif self.tag == 'span' and name == 'class' and 'js-retweet-text' \
  73. in value:
  74. self.recording += 1
  75. self.attributes += attrs
  76. self.id = 'span'
  77. break
  78. elif self.tag == 'img' and name == 'class' and \
  79. 'Emoji Emoji--forText' in value:
  80. self.recording += 1
  81. self.attributes += attrs
  82. self.id = 'p'
  83. break
  84. else:
  85. return
  86. def handle_endtag(self, tag):
  87. """Identify when the tags of interest ends and stop recording data."""
  88. self.tag = tag
  89. if tag == 'p' and self.recording:
  90. self.recording -= 1
  91. elif tag == 'span' and self.recording:
  92. self.recording -= 1
  93. elif tag == 'img' and self.id == 'p' and self.recording:
  94. self.recording -= 1
  95. def handle_data(self, data):
  96. """When recording, save the data."""
  97. if self.recording:
  98. self.tempData.append(data)
  99. elif self.tempData != []:
  100. if self.id == 'p':
  101. self.data.append(self.tempData)
  102. self.tempData = []
  103. elif self.id == 'span':
  104. # Random hash to identify retweets
  105. self.tempData += [' 59bcc3ad6775562f845953cf01624225']
  106. self.data.append(self.tempData)
  107. self.tempData = []
  108. def return_value(self):
  109. """
  110. Return all saved data.
  111. return -- list of list of chopped strings
  112. """
  113. return self.data
  114. def retrieve_html(url):
  115. """
  116. Retrieve HTML code from url.
  117. url -- string containing an url to be retrieved
  118. return -- string containing HTML code or nothing if there's an error
  119. """
  120. try:
  121. code = requests.get(url).text
  122. except:
  123. return
  124. return code
  125. def sanitize(tweet):
  126. """
  127. Sanitize data. Tweet is a list of chopped up strings that need to be
  128. reassembled. Also, it takes out some weird chars.
  129. tweet -- list containing chopped strings with the data
  130. return -- string containing sanitized tweet
  131. """
  132. final = ''
  133. counter = 0
  134. errors = ['…', '\xa0']
  135. for part in tweet:
  136. if part not in errors:
  137. try:
  138. if 'https://' in part:
  139. final += ' '
  140. elif 'http://' in part:
  141. final += ' '
  142. elif 'pic.twitter.com/' in part:
  143. final += ' '
  144. except:
  145. pass
  146. final += part
  147. counter += 1
  148. if final:
  149. return final
  150. def create_feed(user, feeds):
  151. """
  152. Create feed file.
  153. user -- string containing twitter's username
  154. feeds -- list containing tweets
  155. """
  156. user = user.strip()
  157. items = []
  158. for feed in feeds:
  159. i = 0
  160. limite = 5
  161. cuatro_primeras = ''
  162. split = feed.split()
  163. if len(split) <= 5:
  164. limite = len(split)
  165. for i in range(0, limite):
  166. cuatro_primeras += split[i] + ' '
  167. i += 1
  168. # GUID specified to improve feed readers reading
  169. guid = hashlib.sha1(cuatro_primeras.encode()).hexdigest()
  170. item = PyRSS2Gen.RSSItem(
  171. title='@' + user + ' says: ' + cuatro_primeras + '...',
  172. link='https://twitter.com/' + user,
  173. description=feed,
  174. guid=PyRSS2Gen.Guid(guid, isPermaLink=False)
  175. )
  176. items.append(item)
  177. rss = PyRSS2Gen.RSS2(
  178. title='@' + user + ' Twitter\'s feed.',
  179. link='https://twitter.com/' + user,
  180. description='@' + user + ' Twitter\'s feed.',
  181. lastBuildDate=datetime.datetime.now(),
  182. items=items
  183. )
  184. rss.write_xml(open("feeds/" + user + ".xml", "w"), encoding='utf-8')
  185. def slave():
  186. """
  187. It creates threads and executes the __main__ part according to the
  188. 'threads' variable defined in __main__.
  189. """
  190. while True:
  191. tweets = []
  192. user = repr(q.get())
  193. user = user.replace("\\n", "")
  194. user = user.replace("'", "")
  195. code = retrieve_html('https://twitter.com/' + user + '?lang=en')
  196. if code == "":
  197. q.task_done()
  198. break
  199. parser = twitterParser()
  200. parser.feed(code)
  201. data = parser.return_value()
  202. for tweet in data:
  203. tweet = sanitize(tweet)
  204. tweets.append(tweet)
  205. tweets = mark_as_retweet(tweets)
  206. create_feed(user, tweets)
  207. q.task_done()
  208. def mark_as_retweet(tweets):
  209. """
  210. Mark tweet as retweet seeking a concrete number.
  211. tweets -- list of strings containing sanitized tweets
  212. return -- list of strings maked as retweets with the '♻' symbol
  213. """
  214. coincidence = []
  215. for num in enumerate(tweets):
  216. if '59bcc3ad6775562f845953cf01624225' in num[1]:
  217. coincidence.append(num[0])
  218. for coinc in coincidence:
  219. if coinc + 1 < len(tweets):
  220. tweets[coinc+1] = '♻' + tweets[coinc+1]
  221. for coinc in reversed(coincidence):
  222. tweets.pop(coinc)
  223. return tweets
  224. if __name__ == "__main__":
  225. # This variable can be modified
  226. threads = 2
  227. q = Queue()
  228. if not os.path.exists('feeds'):
  229. os.mkdir(("feeds"))
  230. for i in range(threads):
  231. t = Thread(target=slave)
  232. t.daemon = True
  233. t.start()
  234. if len(argv) == 2:
  235. user = argv[1]
  236. q.put(user)
  237. # block the end of the program until all threads are finished
  238. q.join()
  239. else:
  240. try:
  241. feed_file = open('twitter_users', 'r')
  242. except:
  243. print("The file twitter_users does not exist."
  244. " You must create it to continue.")
  245. sys.exit()
  246. feed = feed_file.readlines()
  247. feed_file.close()
  248. for user in feed:
  249. q.put(user)
  250. # block the end of the program until all threads are finished
  251. q.join()