#!/usr/bin/env python
# Source repository: txtsd/fb2vcard

import re
import html
import time
import calendar
import json
from pprint import pprint
from pathlib import Path

import requests
from bs4 import BeautifulSoup as bs


# Endpoints on Facebook's mobile site used by this script.
# LINK_MAIN doubles as the prefix that relative hrefs scraped from pages are
# appended to.
LINK_MAIN = 'https://m.facebook.com'
LINK_LOGIN = 'https://m.facebook.com/login/device-based/regular/login/?refsrc=https://m.facebook.com/&lwv=100&refid=8'
LINK_FRIENDS = 'https://m.facebook.com/friends/center/friends/'
# Requested once after login (query carries l=en_US); the page titles and link
# texts compared later in the script are English strings.
LINK_LANGUAGE = 'https://m.facebook.com/a/language.php?l=en_US&lref=%2Fsettings%2Flanguage%2F&sref=legacy_mobile_settings&gfid=AQADZjSUoWMlr7lH'

# Regex scrapers for the raw HTML. In the part of the file visible here only
# PATTERN_DEACTIVATED is actually used; the others appear solely in
# commented-out lines (or not at all) next to the BeautifulSoup selectors.
# NOTE(review): the short class-name fragments (e.g. "b[qo]", "\w\w") look
# like generated CSS class names — presumably brittle; confirm before reuse.
PATTERN_HOVERCARD_NAME = re.compile(r'<a class="b[qo]" href="(?P<link>.+?)">(?P<name>.+?)</a>')
PATTERN_SEEMORE = re.compile(r'[0ki]"><a href="(.+?)"><span>See More</span></a>')
PATTERN_PROFILE = re.compile(r'<div class="(?:bc|x)"><a href="(.+?)"')
# The trailing '.' is unescaped, so it matches any single character.
PATTERN_DEACTIVATED = re.compile(r'This account has been deactivated.')
PATTERN_ABOUT = re.compile(r'[dgl]"><a href="(.+?)" class="\w\w">About</a>')
# As above, every '.' in this sentence is an unescaped wildcard.
PATTERN_RATELIMIT = re.compile(r'We limit how often you can post, comment or do other things in a given amount of time in order to help protect the community from spam. You can try again later.')
# Named groups: month (word), day (digits) and an optional year; the
# "{0,}" quantifier is equivalent to "*".
PATTERN_BIRTHDAY = re.compile(r'Birthday</span></div></td><td valign="top" class="\w\w"><div class="\w\w">(?P<month>\w+?) (?P<day>\d+)([, ]+(?P<year>\d+)){0,}</div>')

# Friend records keyed by display name; rebound later when the cached JSON
# file is loaded.
list_friends = {}
# Form fields collected from the login page and posted back on login.
data_dict = {}

# Browser-like request headers.
# NOTE(review): nothing in the visible script attaches these to the session
# or to any request — confirm whether that is intentional.
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'),
    ('Accept', ','.join([
        'text/html',
        'application/xhtml+xml',
        'application/xml;q=0.9',
        'image/webp',
        'image/apng',
        '*/*;q=0.8',
        'application/signed-exchange;v=b3;q=0.9',
    ])),
    ('Accept-Language', 'en-US,en;q=0.9'),
    ('Accept-Encoding', 'gzip, deflate'),
])

# One shared session, so cookies obtained while logging in are sent with
# every later request.
session = requests.Session()

# Load the landing page and copy the fields of its first <form> into
# data_dict, so they are posted back together with the credentials.
print('Loading m.facebook.com')
result = session.get(LINK_MAIN)
soup = bs(result.content, 'lxml')
form = soup.select_one('form')
if form is None:
    # Without a form there is nothing to submit; stop with a clear message
    # rather than an AttributeError on None.
    raise SystemExit('ERROR: No login form found at ' + LINK_MAIN)
for nput in form.select('input'):
    field_name = nput.get('name')
    # Inputs without a name would end up under a None key and be posted as
    # a bogus field; 'sign_up' is excluded as in the original filter
    # (presumably the registration button — not part of logging in).
    if field_name is None or field_name == 'sign_up':
        continue
    data_dict[field_name] = nput.get('value')

data_dict['login'] = 'Log In'
# Credentials are blank in the source: fill these in before running.
data_dict['email'] = ''
data_dict['pass'] = r""

print('Logging in')
time.sleep(5)
result = session.post(LINK_LOGIN, data=data_dict)
# NOTE(review): the login response is never inspected, so this message is
# printed even if the credentials were rejected.
print('Logged in!')

# Request the en_US language link so later English text comparisons match.
time.sleep(5)
result = session.get(LINK_LANGUAGE)
print('Language reset')

print('Reading friends JSON')
# Make sure the cache directory exists so a first run can write its results.
Path('data').mkdir(parents=True, exist_ok=True)
try:
    with open('data/' + 'list_friends.json', 'r') as f:
        list_friends = json.load(f)
except FileNotFoundError:
    # First run: there is no cache yet, so start from an empty mapping
    # instead of crashing before anything has been scraped.
    list_friends = {}

# Walk the paginated friends list, adding every friend not already cached.
# Existing entries are left untouched so links gathered on earlier runs
# survive.
print('Populating friends')
result = session.get(LINK_FRIENDS)
while True:
    soup = bs(result.content, 'lxml')

    for friend_anchor in soup.select('#friends_center_main table td > a'):
        name_safe = friend_anchor.get_text().title()
        link_safe = friend_anchor.get('href')
        if not link_safe:
            # An anchor without href cannot be turned into a hovercard URL.
            continue
        if name_safe not in list_friends:
            print('Added:', name_safe)
            list_friends[name_safe] = {'link_hovercard': LINK_MAIN + link_safe}

    # The first link directly inside a <div> is taken as the link to the
    # next page; when it (or its href) is missing, pagination is finished.
    next_anchor = soup.select_one('#friends_center_main div > a')
    link_next_safe = next_anchor.get('href') if next_anchor else None
    if not link_next_safe:
        break
    # Pause between page loads.
    time.sleep(10)
    result = session.get(LINK_MAIN + link_next_safe)
print('Done populating friends!')

print('Writing friends JSON')
with open('data/' + 'list_friends.json', 'w') as f:
    json.dump(list_friends, f, indent=2, ensure_ascii=False)
print('Done writing friends JSON')


# Step 1: visit each friend's hovercard page and record the profile link.
# Progress is written to the JSON cache after every friend, so an
# interrupted run can resume where it stopped.
for person, entry in list_friends.items():
    # Flip to True (e.g. for entries marked 'DEACTIVATED') to force a
    # re-visit of friends that already have a 'link_profile'.
    reprocess = False
    if 'link_profile' in entry and not reprocess:
        continue

    time.sleep(10)
    result = session.get(entry['link_hovercard'])
    # Keep a copy of the raw page on disk.
    with open('data/' + 'hovercard_' + person + '.html', 'wb') as f:
        f.write(result.content)

    soup = bs(result.content, 'lxml')
    soup_match = soup.select_one('#objects_container table td div div:nth-of-type(3) > a')
    print('Visit hovercard > Get profile link:', person)

    # The anchor only counts when it wraps a <span> reading 'View Profile';
    # guard against the anchor or its span being absent.
    label = None
    if soup_match and soup_match.span:
        label = soup_match.span.get_text()
    link_profile_safe = soup_match.get('href') if label == 'View Profile' else None

    if link_profile_safe:
        entry['link_profile'] = LINK_MAIN + link_profile_safe
    elif PATTERN_DEACTIVATED.search(result.text):
        entry['link_profile'] = "DEACTIVATED"
    # Otherwise no 'link_profile' is stored and the friend is retried on
    # the next run.

    with open('data/' + 'list_friends.json', 'w') as f:
        json.dump(list_friends, f, indent=2, ensure_ascii=False)

# Step 2: visit each friend's profile page and record the link to its
# "About" page. Progress is written to the JSON cache after every friend.
for person in list_friends:
    entry = list_friends[person]
    link = entry.get('link_profile')
    # Skip friends with no profile link at all (the hovercard step stores
    # nothing when it finds neither a link nor the deactivated notice),
    # deactivated accounts, and friends whose about link is already known.
    if link is None or link == 'DEACTIVATED' or 'link_about' in entry:
        continue

    time.sleep(30)
    result = session.get(link)
    # Keep a copy of the raw page on disk.
    with open('data/' + 'timeline_' + person + '.html', 'wb') as f:
        f.write(result.content)

    soup = bs(result.content, 'lxml')
    soup_match = soup.select_one('#m-timeline-cover-section > div:nth-of-type(4) > a')
    # A page without <title> yields None from soup.title; treat it as "not
    # the rate-limit page" instead of raising AttributeError.
    page_title = soup.title.text if soup.title else ''
    soup_match_ratelimit = (page_title == "You Can't Use This Feature Right Now")
    print('Visit profile > Get about link:', person)

    link_about_safe = None
    if soup_match and soup_match.get_text() == 'About':
        link_about_safe = soup_match.get('href')

    if link_about_safe:
        entry['link_about'] = LINK_MAIN + link_about_safe
    else:
        print('ERROR: No about link!')
        if soup_match_ratelimit:
            # Distinguish a temporary block from a genuinely missing link.
            print('ERROR: Rate limited!')

    with open('data/' + 'list_friends.json', 'w') as f:
        json.dump(list_friends, f, indent=2, ensure_ascii=False)

# Step 3: download each friend's "About" page to disk for later parsing.
# A page already on disk is fetched again only when its <title> shows that
# an error/placeholder page was saved instead of the real one.
about_retry_titles = (
    "You Can't Use This Feature Right Now",
    "Content Not Found",
    "Error Facebook",
    "Profile Pictures",
)

for person in list_friends:
    if 'link_about' not in list_friends[person]:
        continue

    filename = 'data/' + 'about_' + person + '.html'
    file = Path(filename)
    redownload = False
    if file.is_file():
        # The file was written as raw bytes, so parse it from bytes too and
        # let BeautifulSoup detect the encoding; reading it in text mode
        # with the locale's default encoding can raise UnicodeDecodeError.
        soup = bs(file.read_bytes(), 'lxml')
        # An empty or truncated file has no <title> at all (soup.title is
        # None); treat that as a bad download as well.
        page_title = soup.title.text if soup.title else None
        redownload = page_title is None or page_title in about_retry_titles

    if file.is_file() and not redownload:
        continue

    print('Visit About > Save HTML:', person)
    link = list_friends[person]['link_about']
    time.sleep(30)
    result = session.get(link)
    with open(filename, 'wb') as f:
        f.write(result.content)


# for person in list_friends:

#     # Print Name
#     print('')
#     print(person)

#     # Name
#     fullname = person
#     names = fullname.split(' ')
#     if len(names) == 3:
#         name_first = names[0]
#         name_mid = names[1]
#         name_last = names[2]
#     elif len(names) == 2:
#         name_first = names[0]
#         name_mid = ''
#         name_last = names[1]
#     elif len(names) == 4:
#         name_first = names[0]
#         name_mid = names[1] + ',' + names[2]
#         name_last = names[3]
#     else:
#         name_first = ''
#         name_mid = ''
#         name_last = ''
#     name = '{family};{given};{additional};{prefix};{suffix}'.format(family=name_last, given=name_first, additional=name_mid, prefix='', suffix='')
#     print('FN:' + fullname)
#     print('N:' + name)


# pprint(list_friends)