mirror of
https://codeberg.org/aryak/mozhi
synced 2024-10-30 08:33:01 +05:30
67 lines
1.6 KiB
Python
67 lines
1.6 KiB
Python
|
#!/usr/bin/python3
|
||
|
import requests
|
||
|
import json
|
||
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
# Scrape the instance table from the project page, probe every listed
# instance over HTTP, and write the reachable ones to instances.json.

print("Getting HTML")

# Upper bound, in seconds, applied to every HTTP request below. Requests
# applies no timeout by default, so without this a single unresponsive
# host would stall the whole run.
REQUEST_TIMEOUT = 30

headers = {
    'User-Agent': 'Mozilla/5.0 MozhiInstanceFetcher/1.0 (+codeberg.org/aryak/mozhi)'
}

# Get the HTML from the page
r = requests.get(
    'https://codeberg.org/aryak/mozhi',
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)
# Fail with a clear HTTP error here rather than with an IndexError further
# down when an error page turns out to contain no tables.
r.raise_for_status()

# Parse the HTML
soup = BeautifulSoup(r.text, 'html.parser')

print("Scraping started")

# Get tables
tables = soup.find_all('table')

# The instance list is selected by position: the second table on the page.
# NOTE(review): this breaks if tables are added or reordered on the page.
table = tables[1]

# Get all rows and columns. Skip the first row because it's the header
rows = table.find_all('tr')[1:]

theJson = []

for row in rows:
    # Look the cells up once per row instead of once per field.
    cells = row.find_all('td')
    anchor = cells[0].find('a') if len(cells) >= 4 else None
    if anchor is None or not anchor.get('href'):
        # A malformed row should not abort the run for every other instance.
        print("Skipping a table row with missing columns or no link...")
        continue

    link = anchor['href']
    cloudflare = cells[1].text
    country = cells[2].text
    host = cells[3].text

    print("Scraping " + link + ' instance...')
    isCloudflare = cloudflare == 'Yes'

    # Only the network call sits inside the try block. Catching
    # RequestException (connection errors, timeouts, invalid URLs) keeps
    # Ctrl-C and SystemExit working, which a bare except would swallow.
    try:
        response = requests.get(link + '/', headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print("Error while fetching " + link + '/. Skipping...')
        continue

    if response.status_code != 200:
        print("Error while fetching " + link + '/. We got a ' + str(response.status_code) + ' status code. Skipping...')
        continue

    theJson.append({
        'country': country,
        'link': link,
        'cloudflare': isCloudflare,
        'host': host,
    })

print("Scraping finished. Saving JSON...")

# save JSON
with open('instances.json', 'w') as outfile:
    json.dump(theJson, outfile, indent=4)
print("File saved as instances.json")
|