-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtext_to_csv.py
More file actions
38 lines (32 loc) · 1.49 KB
/
text_to_csv.py
File metadata and controls
38 lines (32 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
import csv
import os
from bs4 import BeautifulSoup
def parse(soup):
    """Extract one row of talk metadata and cleaned transcript text.

    Args:
        soup: A parsed HTML document (BeautifulSoup object) for one talk page.

    Returns:
        A tuple ``(author, title, date, length, text)``:

        * ``author`` / ``title`` -- stripped text of the
          ``h4.h12.talk-link__speaker`` and ``h4.h9.m5`` elements.
        * ``date`` -- stripped text of the first ``span.meta__val`` element.
        * ``length`` -- the last transcript timestamp before the footer,
          converted to seconds (int).
        * ``text`` -- the transcript fragments joined by spaces, with every
          character other than ASCII letters, ``.`` and ``'`` replaced by a
          space.

    Raises:
        AttributeError: If an expected element is missing (the lookup then
            yields ``None``, which has no ``.text``).
        ValueError: If the timestamp contains a non-integer field.
    """
    # Title and author each live in their own tag, so no text splitting is needed.
    author = soup.select_one("h4.h12.talk-link__speaker").text
    title = soup.select_one("h4.h9.m5").text

    # Only surrounding whitespace has to be removed from the date string.
    date = soup.select_one("span.meta__val").text.strip()

    # The final timestamp is the last talk-transcript__para__time element
    # that precedes the page footer.
    stamp = soup.select_one("footer.footer").find_previous(
        "data", {"class": "talk-transcript__para__time"}
    ).text

    # Fold the colon-separated fields base-60 so "mm:ss" and "h:mm:ss" both
    # convert to seconds (for "mm:ss" this is exactly mm * 60 + ss).
    length = 0
    for field in stamp.split(":"):
        length = length * 60 + int(field)

    # Fragments such as "(Applause)" start with "(" and are not spoken text.
    text = " ".join(
        fragment.text
        for fragment in soup.select("span.talk-transcript__fragment")
        if not fragment.text.startswith("(")
    )

    # Keep only letters, periods and apostrophes. A raw string avoids the
    # invalid "\." escape sequence warning of a plain string literal.
    text = re.sub(r"[^a-zA-Z.']", " ", text)

    return author.strip(), title.strip(), date, length, text
def to_csv(pth, out):
    """Parse every file in a directory and write one CSV row per file.

    Args:
        pth: Directory holding the saved HTML pages. Entries that are not
            regular files (e.g. sub-directories) are skipped.
        out: Path of the CSV file to create; an existing file is overwritten.

    The output starts with the header row
    ``author, title, date, length, text`` followed by one row per parsed
    file, in sorted file-name order.
    """
    # newline="" is what the csv module expects: the writer emits its own
    # line terminators, and platform newline translation would otherwise
    # produce "\r\r\n" on Windows. UTF-8 keeps non-ASCII names and titles
    # independent of the locale.
    with open(out, "w", newline="", encoding="utf-8") as csv_file:
        wr = csv.writer(csv_file)
        wr.writerow(["author", "title", "date", "length", "text"])

        # os.listdir returns entries in arbitrary order; sorting makes the
        # row order reproducible between runs.
        for name in sorted(os.listdir(pth)):
            html_path = os.path.join(pth, name)
            if not os.path.isfile(html_path):
                continue

            # Hand BeautifulSoup the raw bytes so it detects the document
            # encoding itself instead of relying on the locale default.
            with open(html_path, "rb") as f:
                wr.writerow(parse(BeautifulSoup(f, "lxml")))
# Script entry: convert the saved pages under ./test into output.csv.
# NOTE(review): this runs at import time as well; confirm whether a
# __main__ guard is wanted.
to_csv(pth="./test", out="output.csv")