Create a CSV of dead journalists in Gaza from a Wikipedia page
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

journalists2csv.py 1.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. from bs4 import BeautifulSoup
  2. import datetime
  3. import csv
  4. import sys
  5. import requests
  6. soup = BeautifulSoup(sys.stdin, "html.parser")
  7. table = soup.find_all("table")[1]
  8. rows = table.find_all("tr")
  9. the_date = ""
  10. sheet = csv.writer(sys.stdout)
  11. labels = [th.get_text().strip() for th in rows[0].find_all("th")]
  12. dups = [{"value": "", "count": 0} for not_used in labels]
  13. sheet.writerow(["Index"] + labels)
  14. index = 0
  15. for row in rows[1:]:
  16. cells = list(reversed([
  17. {"value": td.get_text().split("\xa0")[0].strip(), "rowspan": int(td.attrs.get("rowspan", "1"))}
  18. for td in row.find_all("td")
  19. ]))
  20. res = []
  21. for d in dups:
  22. if d["count"]:
  23. res.append(d["value"])
  24. d["count"] -= 1
  25. else:
  26. if cells:
  27. cell = cells.pop()
  28. else:
  29. cell = {"value": "", "rowspan": 1}
  30. res.append(cell["value"])
  31. if cell["rowspan"]>1:
  32. d["count"] = cell["rowspan"] - 1
  33. d["value"] = cell["value"]
  34. index += 1
  35. sheet.writerow([index] + res)