Create a CSV of dead journalists in Gaza from a Wikipedia page
Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

journalists2csv.py 1.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. from bs4 import BeautifulSoup
  2. import datetime
  3. import csv
  4. import sys
  5. import requests
  6. soup = BeautifulSoup(sys.stdin, "html.parser")
  7. table = soup.find_all("table")[1]
  8. rows = table.find_all("tr")
  9. the_date = ""
  10. sheet = csv.writer(sys.stdout)
  11. labels = [th.get_text().strip() for th in rows[0].find_all("th")]
  12. dups = [{"value": "", "count": 0} for not_used in labels]
  13. sheet.writerow(["Index"] + labels)
  14. index = 0
  15. for row in rows[1:]:
  16. cells = list(reversed([
  17. {"value": td.get_text().strip(), "rowspan": int(td.attrs.get("rowspan", "1"))}
  18. for td in row.find_all("td")
  19. ]))
  20. res = []
  21. for d in dups:
  22. if d["count"]:
  23. res.append(d["value"])
  24. d["count"] -= 1
  25. else:
  26. if cells:
  27. cell = cells.pop()
  28. else:
  29. cell = {"value": "", "rowspan": 1}
  30. res.append(cell["value"])
  31. if cell["rowspan"]>1:
  32. d["count"] = cell["rowspan"] - 1
  33. d["value"] = cell["value"]
  34. index += 1
  35. sheet.writerow([index] + res)