#!/usr/bin/env python3
# Title      : alexa.py
# Description: Alexa Site Overview parser
# Author     : linuxitux
# Date       : 22-08-2016
# Usage      : ./alexa.py
# Notes      : Edit blogsf accordingly
"""Fetch the Alexa Site Overview page of each listed domain and rank them.

The script reads domains (one per line) from ``blogsf``, downloads
``url + domain`` for each of them, extracts the rank found between the
configured delimiters, prints an HTML table of the ranked domains on
standard output and stores ``domain,rank`` lines in a dated file.
Domains without a usable rank are reported on standard error and left
out of both outputs.
"""
import datetime
import operator
import os
import sys
import time
import urllib.error
import urllib.parse
import urllib.request

# File containing domains, one per line ("~" is expanded when it is opened)
blogsf = "~/blogs.txt"
# Prefix of the output file; the current date (YYYY-MM-DD) is appended
ranksf = "~/ranks_"
# Alexa Site Overview URL
url = "http://www.alexa.com/siteinfo/"
# Please, be gentle: seconds to wait after each request
sleeptime = 3
# Seconds to wait for a response before giving up on a domain
request_timeout = 30
# Rank delimiters: the rank is the text that follows substr3 inside the
# first substr1 ... substr2 span of the page
substr1 = "<strong "
substr2 = "</strong"
substr3 = "-->"

# Style block printed in front of the table
report_style = """
<style>
.alexaranks {
font-family: monospace;
font-size: 13px;
color: #333;
}
a, a:active, a:focus, a:visited {
color: #2c9c30;
}
</style>
"""
# One table row: position, linked domain, rank with thousands separators
row_template = (
    '<tr><td>{count}</td><td><a href="http://{site}/">{site}</a></td>'
    '<td style="text-align: right;">{rank:,}</td></tr>'
)


def clean_sites(lines):
    """Return the domains in *lines* without surrounding whitespace.

    Blank lines are dropped, so no request is ever made for an empty
    domain and no newline ends up inside a URL.
    """
    stripped = (line.strip() for line in lines)
    return [site for site in stripped if site]


def load_sites(path):
    """Read the domain list stored in *path* (a leading "~" is expanded)."""
    with open(os.path.expanduser(path), encoding="utf-8") as blogs:
        return clean_sites(blogs)


def parse_rank(page):
    """Extract the rank from the text of a Site Overview page.

    Returns the rank as a positive int, or 0 (the marker this script uses
    for "no rank") when a delimiter is missing or the delimited text is
    not a positive number.
    """
    start = page.find(substr1)
    if start < 0:
        return 0
    end = page.find(substr2, start)
    if end < 0:
        return 0
    fragment = page[start:end]
    marker = fragment.find(substr3)
    if marker < 0:
        return 0
    # Drop every whitespace character and the thousands separators
    digits = "".join(fragment[marker + len(substr3):].split()).replace(",", "")
    try:
        rank = int(digits)
    except ValueError:
        return 0
    return rank if rank > 0 else 0


def fetch_rank(site):
    """Download the overview page of *site* and return its rank (0 if none).

    Network failures and unparsable pages are reported on standard error
    instead of aborting the whole run.
    """
    try:
        with urllib.request.urlopen(url + site, timeout=request_timeout) as page:
            body = page.read().decode("utf-8", errors="replace")
    except OSError as err:
        # urllib.error.URLError, HTTPError and socket timeouts are OSErrors
        print(site + " could not be fetched: " + str(err), file=sys.stderr)
        return 0
    rank = parse_rank(body)
    if rank == 0:
        print(site + " has no rank", file=sys.stderr)
    return rank


def ranked(results):
    """Return the (site, rank) pairs with a rank above zero, best rank first.

    The sort is stable, so pairs with equal ranks keep their input order.
    """
    with_rank = [row for row in results if row[1] > 0]
    return sorted(with_rank, key=operator.itemgetter(1))


def render_report(rows, updated):
    """Build the HTML report for *rows*.

    *rows* is an ordered sequence of (site, rank) pairs and *updated* is
    the already formatted date shown above the table. The returned string
    has no trailing newline.
    """
    lines = [
        report_style,
        '<p class="alexaranks"><i>Actualizado: ' + updated + '</i></p>',
        '<table id="alexaranks" class="alexaranks">',
    ]
    for count, (site, rank) in enumerate(rows, start=1):
        lines.append(row_template.format(count=count, site=site, rank=rank))
    lines.append('</table>')
    lines.append('<p> </p>')
    return "\n".join(lines)


def save_ranks(rows, path):
    """Write *rows* as ``site,rank`` lines to *path* ("~" is expanded)."""
    with open(os.path.expanduser(path), "w", encoding="utf-8") as outfile:
        outfile.writelines(
            "{},{}\n".format(site, rank) for site, rank in rows
        )


def main():
    """Scrape every listed domain, print the HTML table and save the ranks."""
    # Taken before scraping so the file name reflects when the run started
    date = time.strftime("%Y-%m-%d")
    results = []
    for site in load_sites(blogsf):
        results.append((site, fetch_rank(site)))
        time.sleep(sleeptime)
    rows = ranked(results)
    print(render_report(rows, datetime.date.today().strftime('%d/%m/%Y')))
    save_ranks(rows, ranksf + date)


if __name__ == "__main__":
    main()