The following script structures data into a single JSON file
This Script is related to the webscraping and Text Markup blog post
At the beginning and outside the loop a variable called “allFiles”, as an empty list, is created to store all the ouputs from the loop via the “var” variable in one list.
In the second loop the variable “var” (at the very end of the loop) is created and structures the data as JSON data.
Finally, the with open statement creates one single JSON file called summary.json. This final statement must be outside the loop.
import re, os, json
source = "~/Documents/Dh_Tools/lesson7/5th/"
target = "~/Documents/Dh_Tools/lesson9/combined_all/"
lof = os.listdir(source)
counter = 0 # general counter to keep track of the progress
allFiles = []
for f in lof:
if f.startswith("dltext"): # fileName test
with open(source + f, "r", encoding="utf8") as f1:
text = f1.read()
# try to find the date
date = re.search(r'<date value="([\d-]+)"', text).group(1)
# splitting the issue into articles/items
split = re.split("<div3 ", text)
c = 0 # item counter
for s in split[1:]:
c += 1
s = "<div3 " + s # a step to restore the integrity of items
#input(s)
# try to find a unitType
try:
unitType = re.search(r'type="([^\"]+)"', s).group(1)
except:
unitType = "noType"
print(s)
# try to find a header
try:
header = re.search(r'<head>(.*)</head>', s).group(1)
header = re.sub("<[^<]+>", "", header)
except:
header = "NO HEADER"
print("\nNo header found!\n")
text = re.sub("<[^<]+>", "", s)
#text = re.sub(" +\n|\n +", "\n", text)
#text = re.sub("\n+", ";;; ", text)
# generating necessary bits
fName = date+"_"+unitType+"_"+str(c)
itemID = date+"_"+unitType+"_"+str(c)
dateVar = date
unitType = unitType
header = header
text = text
# creating a json variable
var = {
"ID" : itemID,
"date" : dateVar,
"type" : unitType,
"header" : header,
"text" : text
}
#input(var)
allFiles.append(var)
# count processed issues and print progress counter at every 100
counter += 1
if counter % 100 == 0:
print(counter)
with open(target+"summary.json", "w", encoding="utf8") as f9:
f9.write(str(allFiles))