rip_from_facebook.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. #!/usr/bin/env python
  2. import re
  3. import html
  4. import time
  5. import calendar
  6. import json
  7. from pprint import pprint
  8. from pathlib import Path
  9. import requests
  10. from bs4 import BeautifulSoup as bs
  11. LINK_MAIN = 'https://m.facebook.com'
  12. LINK_LOGIN = 'https://m.facebook.com/login/device-based/regular/login/?refsrc=https://m.facebook.com/&lwv=100&refid=8'
  13. LINK_FRIENDS = 'https://m.facebook.com/friends/center/friends/'
  14. LINK_LANGUAGE = 'https://m.facebook.com/a/language.php?l=en_US&lref=%2Fsettings%2Flanguage%2F&sref=legacy_mobile_settings&gfid=AQADZjSUoWMlr7lH'
  15. PATTERN_HOVERCARD_NAME = re.compile(r'<a class="b[qo]" href="(?P<link>.+?)">(?P<name>.+?)</a>')
  16. PATTERN_SEEMORE = re.compile(r'[0ki]"><a href="(.+?)"><span>See More</span></a>')
  17. PATTERN_PROFILE = re.compile(r'<div class="(?:bc|x)"><a href="(.+?)"')
  18. PATTERN_DEACTIVATED = re.compile(r'This account has been deactivated.')
  19. PATTERN_ABOUT = re.compile(r'[dgl]"><a href="(.+?)" class="\w\w">About</a>')
  20. PATTERN_RATELIMIT = re.compile(r'We limit how often you can post, comment or do other things in a given amount of time in order to help protect the community from spam. You can try again later.')
  21. PATTERN_BIRTHDAY = re.compile(r'Birthday</span></div></td><td valign="top" class="\w\w"><div class="\w\w">(?P<month>\w+?) (?P<day>\d+)([, ]+(?P<year>\d+)){0,}</div>')
  22. list_friends = dict()
  23. data_dict = dict()
  24. headers = {
  25. 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0',
  26. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  27. 'Accept-Language': 'en-US,en;q=0.9',
  28. 'Accept-Encoding': 'gzip, deflate',
  29. }
  30. session = requests.Session()
  31. print('Loading m.facebook.com')
  32. result = session.get(LINK_MAIN)
  33. soup = bs(result.content, 'lxml')
  34. form = soup.select_one('form')
  35. for nput in form.select('input'):
  36. if not nput.get('name') in ['sign_up']:
  37. data_dict[nput.get('name')] = nput.get('value')
  38. data_dict['login'] = 'Log In'
  39. data_dict['email'] = ''
  40. data_dict['pass'] = r""
  41. print('Logging in')
  42. time.sleep(5)
  43. result = session.post(LINK_LOGIN,data=data_dict)
  44. print('Logged in!')
  45. time.sleep(5)
  46. result = session.get(LINK_LANGUAGE)
  47. print('Language reset')
  48. print('Reading friends JSON')
  49. with open('data/' + 'list_friends.json', 'r') as f:
  50. list_friends = json.load(f)
  51. print('Populating friends')
  52. result = session.get(LINK_FRIENDS)
  53. # with open('data/downloaded.html', 'wb') as f:
  54. # f.write(result.content)
  55. while True:
  56. # matches = PATTERN_HOVERCARD_NAME.findall(result.text)
  57. soup = bs(result.content, 'lxml')
  58. soup_match = None
  59. soup_matches = soup.select('#friends_center_main table td > a')
  60. for soup_match in soup_matches:
  61. name_safe = soup_match.get_text().title()
  62. link_safe = soup_match.get('href')
  63. if name_safe not in list_friends:
  64. print('Added:', name_safe)
  65. list_friends[name_safe] = {'link_hovercard': LINK_MAIN + link_safe}
  66. # pprint(list_friends)
  67. # link_next = PATTERN_SEEMORE.search(result.text)
  68. soup_match_next = None
  69. soup_match_next = soup.select_one('#friends_center_main div > a')
  70. if not soup_match_next:
  71. # if soup_match_next:
  72. break
  73. link_next_safe = soup_match_next.get('href')
  74. time.sleep(10)
  75. result = session.get(LINK_MAIN + link_next_safe)
  76. print('Done populating friends!')
  77. print('Writing friends JSON')
  78. with open('data/' + 'list_friends.json', 'w') as f:
  79. json.dump(list_friends, f, indent=2, ensure_ascii=False)
  80. print('Done writing friends JSON')
  81. for person in list_friends:
  82. # Visit hovercard link and grab profile link
  83. reprocess = False
  84. # if 'link_profile' in list_friends[person] and list_friends[person]['link_profile'] == 'DEACTIVATED':
  85. # reprocess = True
  86. if 'link_profile' not in list_friends[person] or reprocess:
  87. link = list_friends[person]['link_hovercard']
  88. time.sleep(10)
  89. result = session.get(link)
  90. with open('data/' + 'hovercard_' + person + '.html', 'wb') as f:
  91. f.write(result.content)
  92. # match = None
  93. # match = PATTERN_PROFILE.search(result.text)
  94. soup = bs(result.content, 'lxml')
  95. soup_match = None
  96. soup_match = soup.select_one('#objects_container table td div div:nth-of-type(3) > a')
  97. print('Visit hovercard > Get profile link:', person)
  98. if soup_match and soup_match.span.get_text() == 'View Profile':
  99. link_profile_safe = soup_match.get('href')
  100. list_friends[person]['link_profile'] = LINK_MAIN + link_profile_safe
  101. elif PATTERN_DEACTIVATED.search(result.text):
  102. list_friends[person]['link_profile'] = "DEACTIVATED"
  103. with open('data/' + 'list_friends.json', 'w') as f:
  104. json.dump(list_friends, f, indent=2, ensure_ascii=False)
  105. for person in list_friends:
  106. # Visit profile link and get about link
  107. if not list_friends[person]['link_profile'] == 'DEACTIVATED':
  108. if 'link_about' not in list_friends[person]:
  109. link = list_friends[person]['link_profile']
  110. time.sleep(30)
  111. result = session.get(link)
  112. with open('data/' + 'timeline_' + person + '.html', 'wb') as f:
  113. f.write(result.content)
  114. # match = None
  115. # match = PATTERN_ABOUT.search(result.text)
  116. soup = bs(result.content, 'lxml')
  117. soup_match = None
  118. soup_match = soup.select_one('#m-timeline-cover-section > div:nth-of-type(4) > a')
  119. soup_match_ratelimit = (soup.title.text == "You Can't Use This Feature Right Now")
  120. print('Visit profile > Get about link:', person)
  121. if soup_match and soup_match.get_text() == 'About':
  122. link_about_safe = soup_match.get('href')
  123. list_friends[person]['link_about'] = LINK_MAIN + link_about_safe
  124. else:
  125. print('ERROR: No about link!')
  126. with open('data/' + 'list_friends.json', 'w') as f:
  127. json.dump(list_friends, f, indent=2, ensure_ascii=False)
  128. for person in list_friends:
  129. # Visit about link and save html to parse later
  130. if 'link_about' in list_friends[person]:
  131. filename = 'data/' + 'about_' + person + '.html'
  132. file = Path(filename)
  133. redownload = False
  134. if file.is_file():
  135. with open(filename) as f:
  136. file_html = f.read()
  137. soup = bs(file_html, 'lxml')
  138. soup_match = None
  139. soup_match = (soup.title.text == "You Can't Use This Feature Right Now") or \
  140. (soup.title.text == "Content Not Found") or \
  141. (soup.title.text == "Error Facebook") or \
  142. (soup.title.text == "Profile Pictures")
  143. # match = None
  144. # match = PATTERN_RATELIMIT.search(file_html)
  145. if soup_match:
  146. redownload = True
  147. if (not file.is_file()) or redownload:
  148. print('Visit About > Save HTML:', person)
  149. link = list_friends[person]['link_about']
  150. time.sleep(30)
  151. result = session.get(link)
  152. with open(filename, 'wb') as f:
  153. f.write(result.content)
  154. # for person in list_friends:
  155. # # Print Name
  156. # print('')
  157. # print(person)
  158. # # Name
  159. # fullname = person
  160. # names = fullname.split(' ')
  161. # if len(names) == 3:
  162. # name_first = names[0]
  163. # name_mid = names[1]
  164. # name_last = names[2]
  165. # elif len(names) == 2:
  166. # name_first = names[0]
  167. # name_mid = ''
  168. # name_last = names[1]
  169. # elif len(names) == 4:
  170. # name_first = names[0]
  171. # name_mid = names[1] + ',' + names[2]
  172. # name_last = names[3]
  173. # else:
  174. # name_first = ''
  175. # name_mid = ''
  176. # name_last = ''
  177. # name = '{family};{given};{additional};{prefix};{suffix}'.format(family=name_last, given=name_first, additional=name_mid, prefix='', suffix='')
  178. # print('FN:' + fullname)
  179. # print('N:' + name)
  180. # pprint(list_friends)