Narrative Website Import/tr
Outdated code: The script below was written for the Gramps 2.2 NarrativeWeb and needs to be updated to work with Gramps 3 or later versions. |
This article's content is incomplete or a placeholder stub. |
Use a Gramps-created Narrative Website report to restore your Gramps database.
The program below works by parsing the HTML website (also called "screen scraping") and places the data into a comma-separated value spreadsheet. You can then import it directly into Gramps.
To run the program from the command line, provide the URL of the surname list, like:
python getnarrative.py http://somewebsite.com/myfamily/ > import.csv
Then, in Gramps you should be able to import the file "import.csv" into an empty database.
# Code
# Use the following code, getnarrative.py (the original download link is
# gone: 404), as a good starting point. The script was written for the
# Gramps 2.2 NarrativeWeb and needs to be updated to work with Gramps 3 or
# later versions.
#
# Python script for sucking a GRAMPS Narrative Website back into
# GRAMPS.
# By Doug Blank <[email protected]>
# License: GPL
# (c) 2007
#
# Usage:  python getnarrative.py http://somewebsite.com/myfamily/ > import.csv
# Progress messages go to stderr; the CSV goes to stdout.

import os
import re
import sys
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

# Number of person pages fetched so far (only used for progress output).
count = 0
# handle -> dict of person data.  The None entry makes a lookup of a missing
# spouse handle yield None instead of needing a special case.
person = {None: None}
# (father handle, mother handle) -> list of handles attached to that family.
family = {}
# sorted (handle, spouse handle) -> data scraped from a "Families" section.
family_pair = {}
# (handle, text of the first cell) -> all cells of an "Events" section.
event = {}
# Base URL of the website; set by main() from the command line.
gurl = None

_CELL_RE = re.compile(r'<td class="(.*?)">(.*?)</td>')
_CHILD_LINK_RE = re.compile(r'<a href="(.*?)">(.*?)</a>.*')
_HEADING_RE = re.compile(r"<h.>(.*?)</h.>")
_LINK_RE = re.compile(r'<a href="(.*?)">(.*?)</a>')


def _fetch(url):
    """Return the body of *url* as text, always closing the connection."""
    fp = urlopen(url)
    try:
        data = fp.read()
    finally:
        fp.close()
    if not isinstance(data, str):
        # Python 3 hands back bytes.
        # NOTE(review): assumes the site is UTF-8 encoded -- confirm.
        data = data.decode("utf-8", "replace")
    return data


def _handle_from_cell(cell):
    """Return the page handle linked from a table cell, or None if no link.

    The cell looks like ``<a href=".../HANDLE.html">Name</a>``.  The closing
    tag is removed first because it contains a "/" itself.
    """
    text = cell.replace("</a>", "")
    if "/" not in text:
        return None
    return text.rsplit("/", 1)[1].partition(".html")[0]


def _parent_handle(cell):
    """Reduce a Father/Mother cell to a handle; unlinked text is kept as is."""
    if not cell:
        return cell
    linked = _handle_from_cell(cell)
    if linked is None:
        return cell.replace("</a>", "")
    return linked


def _pair_key(first, second):
    """Return the two handles as a sorted tuple, with None sorting first.

    Python 2 ordered None before every string; Python 3 refuses to compare
    them, so the ordering is spelled out explicitly.
    """
    ordered = sorted([first, second], key=lambda h: (h is not None, h or ""))
    return tuple(ordered)


def _labelled_values(pairs):
    """Yield (kind, label, value) for every cell that has a following cell.

    Pairing each cell with its successor via zip() means a label in the very
    last cell is skipped instead of raising IndexError.
    """
    for (kind, label), (_next_kind, value) in zip(pairs, pairs[1:]):
        yield kind, label, value


def _finish_section(state, pairs, handle, surname, firstname, children):
    """Store the cells collected under the heading *state*.

    state     -- text of the heading that opened the section
    pairs     -- list of (td class, cell text) collected since that heading
    handle    -- handle of the person whose page is being parsed
    children  -- list shared with the person record; it keeps growing while
                 the rest of the page is parsed
    """
    if state == "Parents":
        father, mother = None, None
        for _kind, label, value in _labelled_values(pairs):
            if label == "Father":
                father = value
            if label == "Mother":
                mother = value
        key = (_parent_handle(father), _parent_handle(mother))
        family.setdefault(key, []).append(handle)
    elif state == "Families":
        mdata = {"me": handle}
        mhandle = None
        for key, value in pairs:
            if key == "category":
                mdata["type"] = value
            elif key == "field":
                mdata["spouse"] = value
            elif key == "data":
                linked = _handle_from_cell(value)
                if linked is not None:
                    mhandle = linked
        family_pair[_pair_key(handle, mhandle)] = mdata
    elif state == "Events":
        if pairs:  # an empty section has no first cell to key on
            event[(handle, pairs[0][1])] = pairs
    else:
        exact = state.strip() == (firstname + " " + surname).strip()
        if not exact and state in ("Pedigree", "Ancestors", "Narrative"):
            return  # sections that carry nothing we import
        # NOTE(review): every other heading is taken to be the person's name
        # heading, so an unknown section would overwrite the person record.
        suffix = ""
        if not exact:
            # Whatever is left of the heading after removing the names is
            # treated as the name suffix.
            leftover = state.replace(surname, "").replace(firstname, "")
            suffix = leftover.strip()
        pdata = {"surname": surname,
                 "firstname": firstname,
                 "children": children,
                 "suffix": suffix}
        for kind, label, value in _labelled_values(pairs):
            if kind == "field":
                pdata[label] = value
        person[handle] = pdata


def parsePerson(contents, handle, surname, firstname):
    """Parse the HTML of one person page into the module-level tables.

    A section is only stored when the *next* heading is seen, so cells after
    the last heading on the page are never stored.
    """
    state = None  # text of the most recent heading
    pairs = []    # cells collected since that heading
    children = []
    for line in contents.split("\n"):
        for key, data in _CELL_RE.findall(line):
            if key in ("field", "data", "category"):  # "box" etc. ignored
                pairs.append((key, data))
        if state == "Families" and line.startswith("<a href"):
            # A bare link inside the Families section: child?
            link = _CHILD_LINK_RE.match(line)
            if link:
                target = link.group(1)
                if "/ppl/" in target:
                    chandle = target.rsplit("/", 1)[1].replace(".html", "")
                    children.append(chandle)
        elif "<h" in line:
            heading = _HEADING_RE.match(line)
            if heading:
                if state is not None:
                    _finish_section(state, pairs, handle, surname,
                                    firstname, children)
                pairs = []
                state = heading.group(1)


def loadPerson(url, surname, firstname):
    """Fetch the person page at gurl + "/" + url and parse it.

    The handle is the page's file name without ".html".
    """
    global count
    handle = url.rsplit("/", 1)[-1].replace(".html", "")
    sys.stderr.write("  %s %s ,  %s\n" % (count, surname, firstname))
    count += 1
    parsePerson(_fetch(gurl + "/" + url), handle, surname, firstname)


def loadSurname(url, surname):
    """Fetch one surname page and load every person page it links to."""
    contents = _fetch(gurl + "/" + url)
    for line in contents.split("\n"):
        for href, firstname in _LINK_RE.findall(line):
            if href.endswith(".html") and "/ppl/" in href:
                purl = href.split("/ppl/", 1)[1]
                loadPerson("/ppl/" + purl, surname, firstname)


def mergeSpousePairs():
    """Make sure every couple whose two pages were loaded has a family entry.

    The spouse recorded as "me" is appended to the family's list purely so
    the family exists even without children; writeCSV() filters parents out
    again when listing children.
    """
    for (h1, h2), data in family_pair.items():
        p1 = person.get(h1)
        p2 = person.get(h2)
        if p1 and p2:
            if p1.get("Gender") == "male" and p2.get("Gender") == "female":
                key = (h1, h2)
            else:
                key = (h2, h1)
            family.setdefault(key, []).append(data["me"])


def _csv_row(*fields):
    """Format fields as one CSV line: all quoted, embedded quotes doubled."""
    cells = []
    for field in fields:
        text = "%s" % (field,)
        cells.append('"%s"' % text.replace('"', '""'))
    return ",".join(cells) + "\n"


def writeCSV(out):
    """Write the person, marriage and family/child tables to *out*."""
    out.write("person,firstname,lastname,suffix,gender\n")
    for h in person:
        if h:
            pdata = person[h]
            out.write(_csv_row(h, pdata["firstname"], pdata["surname"],
                               pdata["suffix"], pdata.get("Gender", "")))
    out.write("\n")
    out.write("marriage,parent1,parent2\n")
    marriage = {}
    for number, pair in enumerate(family, 1):
        marriage[pair] = "F%04d" % number
        # NOTE(review): a missing parent is written as the text None, as the
        # original script did -- confirm what the Gramps importer expects.
        out.write(_csv_row(marriage[pair], pair[0], pair[1]))
    out.write("\n")
    out.write("family,child\n")
    for pair in family:
        for kid in sorted(set(family[pair])):
            # A parent must not be listed as a child of their own family
            # (the original "or" test let every parent through).
            if kid not in pair:
                out.write(_csv_row(marriage[pair], kid))


def main(argv):
    """Scrape the website whose surname list is at argv[1]; print the CSV."""
    global gurl
    if len(argv) < 2:
        sys.stderr.write("usage: python getnarrative.py URL > import.csv\n")
        return 2
    gurl = argv[1]  # URL of surnames
    contents = _fetch(gurl)  # read in website
    for line in contents.split("\n"):
        for href, surname in _LINK_RE.findall(line):
            if href.endswith(".html") and href.startswith("srn"):
                sys.stderr.write("Processing surname %s ...\n" % surname)
                loadSurname(href, surname)
    mergeSpousePairs()
    writeCSV(sys.stdout)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
See also
Read the following discussion about this code at Lost grdb(2007) & [1]