# getsearchtxt.py (3.6 KB)
#!/usr/bin/env python3
import gzip
import json
import re
import sys
import time
import traceback

import requests
  9. p=re.compile(r'.*/s/(.*)')
  10. skipp = re.compile(r'.*(cover|screen|频道).*',re.IGNORECASE)
  11. reqcount=1
  12. sharedict=set()
  13. def getlist(w,shareid, fileid,morepage):
  14. global p
  15. global skipp
  16. global reqcount
  17. global sharedict
  18. reqcount += 1
  19. if reqcount % 5 == 0:
  20. print(f"reqcount:{reqcount} shareid:{shareid} fileid:{fileid}",file=sys.stderr)
  21. #time.sleep(1)
  22. url = f'http://192.168.101.188:9978/proxy?do=pikpak&type=list&share_id={shareid}&file_id={fileid}&pass_code=&morepage={morepage}'
  23. print(f"url: {url}",file=sys.stderr)
  24. resp = requests.get(url)
  25. content = resp.content.decode('utf-8')
  26. lines = content.split("\n")
  27. if "folder" not in content and len(lines)<=4:
  28. return
  29. isfirst=True
  30. for line in lines:
  31. if isfirst:
  32. isfirst=False
  33. print(f"first line:{line}",file=sys.stderr)
  34. if skipp.match(line):
  35. continue
  36. linearr = line.split('\t')
  37. if len(linearr)>2:
  38. m = p.match(linearr[0])
  39. if m:
  40. arr = m.group(1).split("/")
  41. else:
  42. arr = linearr[0].split("/")
  43. shareid=arr[0]
  44. fileid=arr[1] if len(arr)>1 else ""
  45. if shareid+"/"+fileid in sharedict:
  46. print(f"skip shareid{shareid} fileid:{fileid}", file=sys.stderr)
  47. continue
  48. w.write(line+"\n")
  49. w.flush()
  50. if linearr[2] == "folder":
  51. getlist(w,shareid,fileid,False)
  52. if len(lines)>0:
  53. getlist(w,shareid,fileid,True)
  54. def main():
  55. try:
  56. f = gzip.open(sys.argv[1]+".raw.gz",mode="rt",encoding="utf-8")
  57. if f is not None:
  58. print(f"found gz raw file:{sys.argv[1]}.raw.gz, extract it",file=sys.stderr)
  59. with(open(sys.argv[1]+".raw","w",encoding="utf-8")) as w:
  60. while(True):
  61. lines = f.readlines()
  62. if len(lines)<=0:
  63. break
  64. for line in lines:
  65. line=line.strip()
  66. w.write(line+"\n")
  67. f.seek(0)
  68. except:
  69. traceback.print_exc()
  70. try:
  71. f = open(sys.argv[1]+".raw","r",encoding="utf-8")
  72. except:
  73. f = None
  74. if f is not None:
  75. print("found old raw file")
  76. while True:
  77. lines = f.readlines()
  78. if len(lines)<=0:
  79. break
  80. for line in lines:
  81. linearr = line.split("\t")
  82. m = p.match(linearr[0])
  83. if m:
  84. arr = m.group(1).split("/")
  85. else:
  86. arr = linearr[0].split("/")
  87. if len(arr)>1:
  88. shareid = arr[0]
  89. fileid = arr[1]
  90. sharedict.add(shareid+"/"+fileid)
  91. f.close()
  92. print(f"old raw file record:{len(sharedict)}")
  93. else:
  94. print("no old raw file")
  95. with(open(sys.argv[1]+".raw","a+",encoding="utf-8")) as w:
  96. with(open(sys.argv[1],"r",encoding="utf-8")) as f:
  97. j = json.load(f)
  98. for c in j:
  99. shareid=c.get("type_id")
  100. fileid=""
  101. m = p.match(shareid)
  102. if m:
  103. arr = m.group(1).split("/")
  104. else:
  105. arr = shareid.split("/")
  106. shareid=arr[0]
  107. fileid=arr[1] if len(arr)>1 else ""
  108. if shareid+"/"+fileid in sharedict:
  109. continue
  110. getlist(w,shareid,fileid,False)
  111. main()