12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- import re
- import requests
- from lxml import html
def get_subcategories(category_url, _visited=None):
    """Return the names of all subcategories reachable from a category page.

    The page at ``category_url`` is downloaded and the links inside the
    ``<div id="mw-subcategories">`` element are collected. Each linked
    subcategory is then visited in turn, depth-first, so the result holds
    the link text of every direct and indirect subcategory.

    Args:
        category_url: Absolute URL of the category page to inspect.
        _visited: Internal set of category URLs already fetched during this
            traversal. Callers should leave it unset.

    Returns:
        A list with the link text of each subcategory found. The list is
        empty when the page has no subcategory section or the section
        contains no matching links.
    """
    if _visited is None:
        _visited = set()
    # Remember every URL we fetch so that a category which (directly or
    # indirectly) lists one of its ancestors cannot trigger endless recursion.
    _visited.add(category_url)

    page = requests.get(category_url)
    tree = html.fromstring(page.content)
    subcategories_container = tree.find('.//div[@id="mw-subcategories"]')
    if subcategories_container is None:
        return []

    subcategory_list = []
    for subcategory_object in subcategories_container.findall('.//li/div/div/a'):
        # hrefs are combined with the module-level ``domain`` to form a full URL.
        subcategory_url = domain + subcategory_object.get('href')
        if subcategory_url in _visited:
            continue
        subcategory_list.append(subcategory_object.text)
        # Accumulate the descendants of *every* subcategory, not only the
        # last one processed.
        subcategory_list.extend(get_subcategories(subcategory_url, _visited))
    return subcategory_list
def get_pages(category_url):
    """Return the titles of all pages listed in a category.

    The page at ``category_url`` is downloaded and the text of every link
    under ``<div id="mw-pages">`` list items is collected. While the second
    direct ``<a>`` child of that element is labelled 'página siguiente', its
    target is fetched as the next page of results and processed the same way.

    Args:
        category_url: Absolute URL of the first results page of the category.

    Returns:
        A list with the link text of each listed page, in the order the
        result pages were fetched. The list is empty when the first page has
        no ``mw-pages`` section.
    """
    page_list = []
    next_url = category_url

    # Follow the pagination with a loop rather than one recursive call (and
    # one list copy) per results page.
    while next_url is not None:
        page = requests.get(next_url)
        tree = html.fromstring(page.content)
        pages_container = tree.find('.//div[@id="mw-pages"]')
        if pages_container is None:
            break

        page_list.extend(
            page_object.text
            for page_object in pages_container.findall('.//li/a')
        )

        # The text check matters: the second direct link is not necessarily
        # the "next page" link, so the href is only followed when it is.
        next_link = pages_container.find('./a[2]')
        if next_link is not None and next_link.text == 'página siguiente':
            # hrefs are combined with the module-level ``domain``.
            next_url = domain + next_link.get('href')
        else:
            next_url = None

    return page_list
# Base URL of the wiki being crawled. get_subcategories() and get_pages()
# read this global when they turn hrefs into absolute URLs.
domain = 'https://es.wikipedia.org'
category_name = 'Wikipedia:Wikipedistas de Perú'
category_url_prefix = domain + '/wiki/Categoría:'

# Matches a user-page title ("Usuario:Name" or "Usuaria:Name") and stops at
# the first "/", so subpages collapse onto their owner. The character class
# is [ao]; the previous [a|o] also accepted a literal "|".
user_page_pattern = re.compile(r'^Usuari[ao]:([^/]+)', re.M | re.I)

users = []
# Mirror of ``users`` used only for constant-time duplicate checks.
seen_users = set()

# Crawl the root category and every subcategory beneath it.
categories = get_subcategories(category_url_prefix + category_name.replace(' ', '_'))
categories.append(category_name)

for category in categories:
    pages_by_category = get_pages(category_url_prefix + category.replace(' ', '_'))

    for page in pages_by_category:
        search_user = user_page_pattern.search(page)
        if search_user is None:
            continue

        user = search_user.group(0)

        # Skip names beginning with "Userbox" (previously detected by checking
        # that ':Userbox' sat at index 7, right after "Usuario"/"Usuaria").
        if search_user.group(1).startswith('Userbox'):
            continue
        if user in seen_users:
            continue

        seen_users.add(user)
        users.append(user)
        print(user)
|