Narrative Website Import

From Gramps
Revision as of 23:45, 18 July 2015 by Sam888 (talk | contribs)
Jump to: navigation, search
Gramps-notes.png This page's factual accuracy may be compromised due to out-of-date information. Please help improve the Gramps Wiki as a useful resource by updating it.
Gnome-important.png
Outdated code

The script below was written for the Gramps 2.2 NarrativeWeb report and needs to be updated to work with Gramps 3 or later versions.

Use a Gramps-created Narrative Website report to restore your Gramps database.

The program below works by parsing the HTML website (also called "screen scraping") and places the data into a comma-separated values (CSV) spreadsheet. You can then import it directly into Gramps.

To run the program from the command line, provide the URL of the surname list, like:

 python getnarrative.py http://somewebsite.com/myfamily/ > import.csv

Then, in Gramps you should be able to import the file "import.csv" into an empty database.

Tango-Dialog-information.png
Code

Use the following code, getnarrative.py (the original download link is gone, 404), as a good starting point. (The script was written for the Gramps 2.2 NarrativeWeb report and needs to be updated to work with Gramps 3 or later versions.)
# Python script for sucking a GRAMPS Narrative Website back into
# GRAMPS.

# By Doug Blank <[email protected]>
# License: GPL
# (c) 2007


import os, sys, urllib, re

# Number of person pages fetched so far; printed as a progress counter by
# loadPerson and later reused to number the families in the CSV output.
count = 0
# Person handle -> dict of scraped person fields.  The None key is pre-seeded
# so that looking up a missing (None) spouse handle yields None rather than a
# person record.
person = {None: None}
# (father, mother) -> list of person handles attached to that couple.
family = {}
# Sorted (handle, spouse handle) tuple -> dict describing a "Families" section.
family_pair = {}
# (handle, text of the first cell of an "Events" section) -> list of the
# (cell class, cell text) pairs collected for that section.
event = {}

def _link_handle(cell):
    """Return the page handle embedded in a link fragment, or None.

    ``cell`` is cell text with any ``</a>`` already removed, for example
    ``<a href="../ppl/a/b/HANDLE.html">Name``.  The handle is whatever sits
    between the last "/" and the first following ".html".  None is returned
    when the text contains no "/" or no ".html" after the last "/".
    """
    if "/" not in cell:
        return None
    tail = cell.rsplit("/", 1)[1]
    if ".html" not in tail:
        return None
    return tail.split(".html", 1)[0]


def _parent_key(cell):
    """Return the handle linked from a parent cell, else the cleaned text."""
    # "</a>" must go first: it contains a "/" that would confuse the split.
    cleaned = cell.replace("</a>", "")
    linked = _link_handle(cleaned)
    return cleaned if linked is None else linked


def _person_record(surname, firstname, children, pairs, suffix):
    """Build a person dict from the cells that followed the name heading.

    Every cell of class "field" is used as a key whose value is the text of
    the cell that comes right after it.  ``children`` is stored by reference
    on purpose: loadPerson keeps appending to the same list while it reads
    the rest of the page.
    """
    record = {"surname": surname,
              "firstname": firstname,
              "children": children,
              "suffix": suffix}
    # Pairing each cell with its successor cannot run past the end of the
    # list, unlike indexing pairs[i + 1] for a trailing "field" cell.
    for current, following in zip(pairs, pairs[1:]):
        if current[0] == "field":
            record[current[1]] = following[1]
    return record


def _record_section(state, pairs, handle, surname, firstname, children):
    """Store the cells collected under heading ``state`` in the global tables.

    ``pairs`` is the list of (cell class, cell text) tuples seen since that
    heading.  Depending on the heading this updates ``family``,
    ``family_pair``, ``event`` or ``person``.
    """
    if state == "Parents":
        father, mother = None, None
        # The cell after a "Father"/"Mother" label holds that parent's link.
        for current, following in zip(pairs, pairs[1:]):
            if current[1] == "Father":
                father = following[1]
            if current[1] == "Mother":
                mother = following[1]
        if father:
            father = _parent_key(father)
        if mother:
            mother = _parent_key(mother)
        family.setdefault((father, mother), []).append(handle)
    elif state == "Families":
        mdata = {"me": handle}
        mhandle = None
        # NOTE: only one entry is stored per section, so when several spouse
        # links are present the last one wins.
        for (key, value) in pairs:
            if key == "category":
                mdata["type"] = value
            elif key == "field":
                mdata["spouse"] = value
            elif key == "data":
                linked = _link_handle(value.replace("</a>", ""))
                if linked is not None:
                    mhandle = linked
        # Order the two handles with a missing (None) spouse first; the
        # explicit key avoids comparing None with a string directly.
        handles = sorted([handle, mhandle],
                         key=lambda h: (h is not None, h or ""))
        family_pair[tuple(handles)] = mdata
    elif state == "Events":
        # An "Events" heading with no table cells has nothing to key on.
        if pairs:
            event[(handle, pairs[0][1])] = pairs
    elif state.strip() == (firstname + " " + surname).strip():
        person[handle] = _person_record(surname, firstname, children,
                                        pairs, "")
    elif state in ("Pedigree", "Ancestors", "Narrative"):
        pass  # sections that carry nothing to import
    else:
        # Heading did not match the name exactly: whatever is left after
        # removing surname and first name is kept as the suffix.
        leftover = state.replace(surname, "").replace(firstname, "").strip()
        person[handle] = _person_record(surname, firstname, children,
                                        pairs, leftover)


def loadPerson(url, surname, firstname):
    """Fetch one person page and record its contents in the global tables.

    url       -- path of the person page relative to the global ``gurl``
    surname   -- surname under which the page was listed
    firstname -- link text of the page on the surname list

    The page is read line by line.  Table cells of class "field", "data" or
    "category" are collected, and each time a new ``<h?>`` heading is met the
    cells gathered under the previous heading are stored by
    ``_record_section``.  While inside the "Families" section, lines that
    start with a link to a ``/ppl/`` page are recorded as children.

    NOTE: cells following the last heading on the page are never stored,
    because storing is only triggered by the next heading.

    Writes a progress line to stderr and increments the global ``count``.
    Python 2 only (print statement, ``urllib.urlopen``).
    """
    global count
    handle = url.rsplit("/", 1)[-1].replace(".html", "")
    print >> sys.stderr, "   ", count, surname, ", ", firstname
    count += 1
    pfp = urllib.urlopen(gurl + "/" + url)
    try:
        contents = pfp.read()
    finally:
        pfp.close()  # do not leak one connection per person page
    state = None
    pairs = []
    children = []
    for line in contents.split("\n"):
        cells = re.findall("""<td class="(.*?)">(.*?)</td>""", line)
        for key, data in cells:
            if key in ("field", "data", "category"):
                pairs.append((key, data))
        if state == "Families" and line.startswith("<a href"):  # child?
            link = re.match("""<a href="(.*?)">(.*?)</a>.*""", line)
            if link:
                target = link.groups()[0]
                if "/ppl/" in target:
                    chandle = target.rsplit("/", 1)[1].replace(".html", "")
                    children.append(chandle)
        elif "<h" in line:
            heading = re.match("<h.>(.*?)</h.>", line)
            if heading:
                if state is not None:
                    _record_section(state, pairs, handle,
                                    surname, firstname, children)
                # Start collecting cells for the new section.
                pairs = []
                state = heading.groups()[0]

def loadSurname(url, surname):
    """Fetch one surname page and load every person page it links to.

    url     -- path of the surname page relative to the global ``gurl``
    surname -- surname shown for that page on the surname list

    Every ``<a href=...>`` on the page whose target ends in ".html" and
    contains "/ppl/" is passed to ``loadPerson``, with the link text used as
    the first name.
    """
    sfp = urllib.urlopen(gurl + "/" + url)
    try:
        contents = sfp.read()
    finally:
        sfp.close()  # do not leak one connection per surname page
    for line in contents.split("\n"):
        links = re.findall("""<a href="(.*?)">(.*?)</a>""", line)
        for href, firstname in links:
            if href.endswith(".html") and "/ppl/" in href:
                # Keep everything after the first "/ppl/"; limiting the split
                # avoids an unpacking error if the marker occurs twice.
                purl = href.split("/ppl/", 1)[1]
                loadPerson("/ppl/" + purl, surname, firstname)


# Entry point: read the surname index given on the command line and load
# every surname page it links to.
if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError.
    sys.exit("usage: python getnarrative.py URL > import.csv")
gurl = sys.argv[1] # URL of surnames
fp = urllib.urlopen(gurl)
try:
    contents = fp.read() # read in website
finally:
    fp.close()
for line in contents.split("\n"):
    # Surname pages are the links whose target starts with "srn" and ends
    # with ".html"; the link text is the surname itself.
    for surnameURL in re.findall("""<a href="(.*?)">(.*?)</a>""", line):
        url, surname = surnameURL
        if url.endswith(".html") and url.startswith("srn"):
            print >> sys.stderr, "Processing surname", surname, "..."
            loadSurname(*surnameURL)

# Write the collected data to stdout as three CSV sections: people,
# marriages, and family/child links.

def _csv_line(*values):
    """Return the values as one CSV line with every field double-quoted.

    Double quotes inside a value are doubled so the field stays well formed.
    """
    quoted = ['"%s"' % ("%s" % (value,)).replace('"', '""')
              for value in values]
    return ",".join(quoted) + "\n"


sys.stdout.write("person,firstname,lastname,suffix,gender\n")
for h in person:
    if h:  # skips the None placeholder key
        record = person[h]
        # A page without a Gender field yields an empty column instead of
        # aborting the whole export with a KeyError.
        sys.stdout.write(_csv_line(h,
                                   record["firstname"],
                                   record["surname"],
                                   record["suffix"],
                                   record.get("Gender", "")))

# Make sure every known couple has a family entry, even without children, by
# attaching the spouse whose page described the couple.  That spouse is
# filtered out again when the children are written below.
for fam in family_pair:
    data = family_pair[fam]
    h1, h2 = fam
    p1 = person.get(h1)
    p2 = person.get(h2)
    if p1 and p2:
        if p1.get("Gender") == "male" and p2.get("Gender") == "female":
            couple = (h1, h2)
        else:
            couple = (h2, h1)
        family.setdefault(couple, []).append(data["me"])

sys.stdout.write("\n")
sys.stdout.write("marriage,parent1,parent2\n")
count = 1
marriage = {}
for pair in family:
    marriage[pair] = "F%04d" % count
    sys.stdout.write(_csv_line(marriage[pair], pair[0], pair[1]))
    count += 1

sys.stdout.write("\n")
sys.stdout.write("family,child\n")
for pair in family:
    for kid in set(family[pair]):
        # A parent must never be listed as a child of their own family.  The
        # original "!= or !=" test let both parents through.
        if kid != pair[0] and kid != pair[1]:
            sys.stdout.write(_csv_line(marriage[pair], kid))


See also

Read the following discussion about this code at Lost grdb & [1]