# twitter2rss.py
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Author:: drymer <drymer [ EN ] autistici.org>
  5. # Copyright:: Copyright (c) 2016, drymer
  6. #
  7. # This program is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, either version 2 of the License, or (at
  10. # your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful, but
  13. # WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. # General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. #
  20. from html.parser import HTMLParser
  21. from sys import argv
  22. from threading import Thread
  23. from queue import Queue
  24. import datetime
  25. import PyRSS2Gen
  26. import hashlib
  27. import requests
  28. import sys
  29. import os
  30. class twitterParser(HTMLParser):
  31. """HTMLParser class object."""
  32. def __init__(self):
  33. """
  34. Inicialize __init__ class with some variables used later
  35. declared variables:
  36. self.recording -- int that controls if there's data recording
  37. self.data -- list containing the parsed HTML
  38. self.attributes -- list containing HTML tag's attributes
  39. self.tempData -- temporal list containing parsed HTML
  40. self.id -- string containin tag information for differentiating
  41. """
  42. # Support por python3.2, but it may be incosistent
  43. try:
  44. HTMLParser.__init__(self, convert_charrefs=True)
  45. except:
  46. HTMLParser.__init__(self)
  47. self.recording = 0
  48. self.data = []
  49. self.attributes = []
  50. self.tempData = []
  51. self.id = ''
  52. def handle_starttag(self, tag, attrs):
  53. """
  54. Identify when the tags of interest begins and start recording the data.
  55. return -- just a way for breake the loop
  56. """
  57. self.tag = tag
  58. if self.tag not in ['p', 'span', 'img']:
  59. return
  60. elif self.recording:
  61. self.recording += 1
  62. return
  63. # Key to find the tweets and identify if they are retweets.
  64. # It is likely to change over time, as Twitter do the same
  65. for name, value in attrs:
  66. if self.tag == 'p' and name == 'class' \
  67. and 'TweetTextSize TweetTextSize' in value:
  68. self.recording += 1
  69. self.attributes += attrs
  70. self.id = 'p'
  71. break
  72. elif self.tag == 'span' and name == 'class' and 'js-retweet-text' \
  73. in value:
  74. self.recording += 1
  75. self.attributes += attrs
  76. self.id = 'span'
  77. break
  78. elif self.tag == 'img' and name == 'class' and \
  79. 'Emoji Emoji--forText' in value:
  80. self.recording += 1
  81. self.attributes += attrs
  82. self.id = 'p'
  83. break
  84. else:
  85. return
  86. def handle_endtag(self, tag):
  87. """Identify when the tags of interest ends and stop recording data."""
  88. self.tag = tag
  89. if tag == 'p' and self.recording:
  90. self.recording -= 1
  91. elif tag == 'span' and self.recording:
  92. self.recording -= 1
  93. elif tag == 'img' and self.id == 'p' and self.recording:
  94. self.recording -= 1
  95. def handle_data(self, data):
  96. """When recording, save the data."""
  97. if self.recording:
  98. self.tempData.append(data)
  99. elif self.tempData != []:
  100. if self.id == 'p':
  101. self.data.append(self.tempData)
  102. self.tempData = []
  103. elif self.id == 'span':
  104. # Random hash to identify retweets
  105. self.tempData += [' 59bcc3ad6775562f845953cf01624225']
  106. self.data.append(self.tempData)
  107. self.tempData = []
  108. def return_value(self):
  109. """
  110. Return all saved data.
  111. return -- list of list of chopped strings
  112. """
  113. return self.data
  114. def retrieve_html(url):
  115. """
  116. Retrieve HTML code from url.
  117. url -- string containing an url to be retrieved
  118. return -- string containing HTML code or nothing if there's an error
  119. """
  120. try:
  121. code = requests.get(url).text
  122. except:
  123. return
  124. return code
  125. def sanitize(tweet):
  126. """
  127. Sanitize data. Tweet is a list of chopped up strings that need to be
  128. reassembled. Also, it takes out some weird chars.
  129. tweet -- list containing chopped strings with the data
  130. return -- string containing sanitized tweet
  131. """
  132. final = ''
  133. counter = 0
  134. errors = ['…', '\xa0']
  135. for part in tweet:
  136. if part not in errors:
  137. try:
  138. if 'https://' in part:
  139. final += ' '
  140. elif 'http://' in part:
  141. final += ' '
  142. elif 'pic.twitter.com/' in part:
  143. final += ' '
  144. except:
  145. pass
  146. final += part
  147. counter += 1
  148. if final:
  149. return final
  150. def create_feed(user, feeds):
  151. """
  152. Create feed file.
  153. user -- string containing twitter's username
  154. feeds -- list containing tweets
  155. """
  156. user = user.strip()
  157. items = []
  158. for feed in feeds:
  159. i = 0
  160. limite = 5
  161. cuatro_primeras = ''
  162. split = feed.split()
  163. if len(split) <= 5:
  164. limite = len(split)
  165. for i in range(0, limite):
  166. cuatro_primeras += split[i] + ' '
  167. i += 1
  168. # GUID specified to improve feed readers reading
  169. guid = hashlib.sha1(cuatro_primeras.encode()).hexdigest()
  170. item = PyRSS2Gen.RSSItem(
  171. title='@' + user + ' says: ' + cuatro_primeras + '...',
  172. link='https://twitter.com/' + user,
  173. description=feed,
  174. guid=PyRSS2Gen.Guid(guid, isPermaLink=False)
  175. )
  176. items.append(item)
  177. rss = PyRSS2Gen.RSS2(
  178. title='@' + user + ' Twitter\'s feed.',
  179. link='https://twitter.com/' + user,
  180. description='@' + user + ' Twitter\'s feed.',
  181. lastBuildDate=datetime.datetime.now(),
  182. items=items
  183. )
  184. rss.write_xml(open("feeds/" + user + ".xml", "w"), encoding='utf-8')
  185. def slave():
  186. """
  187. It creates threads and executes the __main__ part according to the
  188. 'threads' variable defined in __main__.
  189. """
  190. while True:
  191. tweets = []
  192. user = repr(q.get())
  193. user = user.replace("\\n", "")
  194. user = user.replace("'", "")
  195. code = retrieve_html('https://twitter.com/' + user + '?lang=en')
  196. if code == "":
  197. q.task_done()
  198. break
  199. parser = twitterParser()
  200. parser.feed(code)
  201. data = parser.return_value()
  202. for tweet in data:
  203. tweet = sanitize(tweet)
  204. tweets.append(tweet)
  205. tweets = mark_as_retweet(tweets)
  206. create_feed(user, tweets)
  207. q.task_done()
  208. def mark_as_retweet(tweets):
  209. """
  210. Mark tweet as retweet seeking a concrete number.
  211. tweets -- list of strings containing sanitized tweets
  212. return -- list of strings maked as retweets with the '♻' symbol
  213. """
  214. coincidence = []
  215. for num in enumerate(tweets):
  216. if '59bcc3ad6775562f845953cf01624225' in num[1]:
  217. coincidence.append(num[0])
  218. for coinc in coincidence:
  219. if coinc + 1 < len(tweets):
  220. tweets[coinc+1] = '♻' + tweets[coinc+1]
  221. for coinc in reversed(coincidence):
  222. tweets.pop(coinc)
  223. return tweets
  224. if __name__ == "__main__":
  225. # This variable can be modified
  226. threads = 2
  227. q = Queue()
  228. if not os.path.exists('feeds'):
  229. os.mkdir(("feeds"))
  230. for i in range(threads):
  231. t = Thread(target=slave)
  232. t.daemon = True
  233. t.start()
  234. if len(argv) == 2:
  235. user = argv[1]
  236. q.put(user)
  237. # block the end of the program until all threads are finished
  238. q.join()
  239. else:
  240. try:
  241. feed_file = open('twitter_users', 'r')
  242. except:
  243. print("The file twitter_users does not exist."
  244. " You must create it to continue.")
  245. sys.exit()
  246. feed = feed_file.readlines()
  247. feed_file.close()
  248. for user in feed:
  249. q.put(user)
  250. # block the end of the program until all threads are finished
  251. q.join()