From d2dbe4427da5be2b4ed3b774d1648afb5dcd9c55 Mon Sep 17 00:00:00 2001 From: Tim G L Lyons Date: Mon, 11 Mar 2013 18:44:01 +0000 Subject: [PATCH] Exportgedcom and importgedcom. 0006382: ADDR tag in GEDCOM export does not contain a full address. On export, all the elements of the structured address are output as ADDR/CONT. On import the structured address takes precedence (as at present), but a warning is only given if an element in the free-form address is missing from the structured address. svn: r21608 --- src/plugins/export/ExportGedcom.py | 130 +++++++++--------- src/plugins/lib/libgedcom.py | 214 +++++++++++++++++------------ 2 files changed, 188 insertions(+), 156 deletions(-) diff --git a/src/plugins/export/ExportGedcom.py b/src/plugins/export/ExportGedcom.py index cfd7c6879..c9a95d464 100644 --- a/src/plugins/export/ExportGedcom.py +++ b/src/plugins/export/ExportGedcom.py @@ -364,39 +364,19 @@ class GedcomWriter(UpdateCallback): """ owner = self.dbase.get_researcher() name = owner.get_name() - addr = owner.get_address() - adr2 = owner.get_locality() - city = owner.get_city() - state = owner.get_state() - ctry = owner.get_country() - post = owner.get_postal_code() phon = owner.get_phone() mail = owner.get_email() - if not name : - name = u'Not Provided' - if not addr : - addr = u'Not Provided' - self.__writeln(0, "@SUBM@", "SUBM") self.__writeln(1, "NAME", name) - self.__writeln(1, "ADDR", addr) - if city and state and post: - self.__writeln(2, "CONT", "%s, %s %s" % (city, state, post)) - else: - self.__writeln(2, "CONT", u"Not Provided") - if addr: - self.__writeln(2, "ADR1", addr) - if adr2: - self.__writeln(2, "ADR2", adr2) - if city: - self.__writeln(2, "CITY", city) - if state: - self.__writeln(2, "STAE", state) - if post: - self.__writeln(2, "POST", post) - if ctry: - self.__writeln(2, "CTRY", ctry) + + # Researcher is a sub-type of LocationBase, so get_city etc. which are + # used in __write_addr work fine. 
However, the database owner street is + # stored in address, so we need to temporarily copy it into street so + # __write_addr works properly + owner.set_street(owner.get_address()) + self.__write_addr(1, owner) + if phon: self.__writeln(1, "PHON", phon) if mail: @@ -687,19 +667,7 @@ class GedcomWriter(UpdateCallback): for addr in person.get_address_list(): self.__writeln(1, 'RESI') self.__date(2, addr.get_date_object()) - self.__writeln(2, "ADDR", addr.get_street()) - if addr.get_street(): - self.__writeln(3, 'ADR1', addr.get_street()) - if addr.get_locality(): - self.__writeln(3, 'ADR2', addr.get_locality()) - if addr.get_city(): - self.__writeln(3, 'CITY', addr.get_city()) - if addr.get_state(): - self.__writeln(3, 'STAE', addr.get_state()) - if addr.get_postal_code(): - self.__writeln(3, 'POST', addr.get_postal_code()) - if addr.get_country(): - self.__writeln(3, 'CTRY', addr.get_country()) + self.__write_addr(2, addr) if addr.get_phone(): self.__writeln(2, 'PHON', addr.get_phone()) @@ -1017,19 +985,7 @@ class GedcomWriter(UpdateCallback): if repo.get_name(): self.__writeln(1, 'NAME', repo.get_name()) for addr in repo.get_address_list(): - self.__writeln(1, "ADDR", addr.get_street()) - if addr.get_street(): - self.__writeln(2, 'ADR1', addr.get_street()) - if addr.get_locality(): - self.__writeln(2, 'ADR2', addr.get_locality()) - if addr.get_city(): - self.__writeln(2, 'CITY', addr.get_city()) - if addr.get_state(): - self.__writeln(2, 'STAE', addr.get_state()) - if addr.get_postal_code(): - self.__writeln(2, 'POST', addr.get_postal_code()) - if addr.get_country(): - self.__writeln(2, 'CTRY', addr.get_country()) + self.__write_addr(1, addr) if addr.get_phone(): self.__writeln(1, 'PHON', addr.get_phone()) for url in repo.get_url_list(): @@ -1412,24 +1368,64 @@ class GedcomWriter(UpdateCallback): # http://homepages.rootsweb.com/~pmcbride/gedcom/55gcch2.htm#EVENT_DETAIL location = place.get_main_location() if location and not location.is_empty(): - 
self.__writeln(level, "ADDR", location.get_street()) - if location.get_street(): - self.__writeln(level + 1, 'ADR1', location.get_street()) - if location.get_locality(): - self.__writeln(level + 1, 'ADR2', location.get_locality()) - if location.get_city(): - self.__writeln(level + 1, 'CITY', location.get_city()) - if location.get_state(): - self.__writeln(level + 1, 'STAE', location.get_state()) - if location.get_postal_code(): - self.__writeln(level + 1, 'POST', location.get_postal_code()) - if location.get_country(): - self.__writeln(level + 1, 'CTRY', location.get_country()) + self.__write_addr(level, location) if location.get_phone(): self.__writeln(level, 'PHON', location.get_phone()) self.__note_references(place.get_note_list(), level+1) + def __write_addr(self, level, addr): + """ + n ADDR {0:1} + +1 CONT {0:M} + +1 ADR1 {0:1} (Street) + +1 ADR2 {0:1} (Locality) + +1 CITY {0:1} + +1 STAE {0:1} + +1 POST {0:1} + +1 CTRY {0:1} + + This is done along the lines suggested by Tamura Jones in + http://www.tamurajones.net/GEDCOMADDR.xhtml as a result of bug 6382. + "GEDCOM writers should always use the structured address format, + and it use it for all addresses, including the submitter address and + their own corporate address." "Vendors that want their product to pass + even the strictest GEDCOM validation, should include export to the old + free-form format..." [This goes on to say the free-form should be an + option, but we have not made it an option in Gramps]. 
+ + @param level: The level number for the ADDR tag + @type level: Integer + @param addr: The location or address + @type addr: [a super-type of] LocationBase + """ + if addr.get_street() or addr.get_locality() or addr.get_city() or \ + addr.get_state() or addr.get_postal_code or addr.get_country(): + self.__writeln(level, 'ADDR', addr.get_street()) + if addr.get_locality(): + self.__writeln(level + 1, 'CONT', addr.get_locality()) + if addr.get_city(): + self.__writeln(level + 1, 'CONT', addr.get_city()) + if addr.get_state(): + self.__writeln(level + 1, 'CONT', addr.get_state()) + if addr.get_postal_code(): + self.__writeln(level + 1, 'CONT', addr.get_postal_code()) + if addr.get_country(): + self.__writeln(level + 1, 'CONT', addr.get_country()) + + if addr.get_street(): + self.__writeln(level + 1, 'ADR1', addr.get_street()) + if addr.get_locality(): + self.__writeln(level + 1, 'ADR2', addr.get_locality()) + if addr.get_city(): + self.__writeln(level + 1, 'CITY', addr.get_city()) + if addr.get_state(): + self.__writeln(level + 1, 'STAE', addr.get_state()) + if addr.get_postal_code(): + self.__writeln(level + 1, 'POST', addr.get_postal_code()) + if addr.get_country(): + self.__writeln(level + 1, 'CTRY', addr.get_country()) + #------------------------------------------------------------------------- # # diff --git a/src/plugins/lib/libgedcom.py b/src/plugins/lib/libgedcom.py index a636e96ad..8a2631f2e 100644 --- a/src/plugins/lib/libgedcom.py +++ b/src/plugins/lib/libgedcom.py @@ -98,6 +98,7 @@ from xml.parsers.expat import ParserCreate from collections import defaultdict import cStringIO from urlparse import urlparse +import string #------------------------------------------------------------------------ # @@ -629,9 +630,6 @@ DATE_QUALITY = { # regular expressions # #------------------------------------------------------------------------- -ADDR_RE = re.compile('(.+)([\n\r]+)(.+)\s*, (.+)\s+(\d+)\s*(.*)') -ADDR2_RE = re.compile('(.+)([\n\r]+)(.+)\s*, (.+)\s+(\d+)') 
-ADDR3_RE = re.compile('(.+)([\n\r]+)(.+)\s*, (.+)') NOTE_RE = re.compile(r"\s*\d+\s+\@(\S+)\@\s+NOTE(.*)$") CONT_RE = re.compile(r"\s*\d+\s+CONT\s?(.*)$") CONC_RE = re.compile(r"\s*\d+\s+CONC\s?(.*)$") @@ -2087,6 +2085,7 @@ class GedcomParser(UpdateCallback): TOKEN_SOUR : self.__event_source, TOKEN_PLAC : self.__event_place, TOKEN_ADDR : self.__event_addr, + TOKEN_PHON : self.__event_phon, TOKEN_CAUS : self.__event_cause, TOKEN_AGNC : self.__event_agnc, TOKEN_AGE : self.__event_age, @@ -2198,14 +2197,15 @@ class GedcomParser(UpdateCallback): self.func_list.append(self.object_parse_tbl) self.parse_loc_tbl = { - TOKEN_ADDR : self.__location_addr, TOKEN_ADR1 : self.__location_adr1, TOKEN_ADR2 : self.__location_adr2, - TOKEN_DATE : self.__location_date, TOKEN_CITY : self.__location_city, TOKEN_STAE : self.__location_stae, TOKEN_POST : self.__location_post, TOKEN_CTRY : self.__location_ctry, + # Not legal GEDCOM - not clear why these are included at this level + TOKEN_ADDR : self.__ignore, + TOKEN_DATE : self.__location_date, TOKEN_NOTE : self.__location_note, TOKEN_RNOTE : self.__location_note, TOKEN__LOC : self.__ignore, @@ -2263,8 +2263,7 @@ class GedcomParser(UpdateCallback): # +1 <> {0:1} TOKEN_CHAN : self.__family_chan, TOKEN_ENDL : self.__ignore, - - TOKEN_ADDR : self.__family_addr, + TOKEN_ADDR : self.__ignore, TOKEN_RIN : self.__family_cust_attr, TOKEN_SUBM : self.__ignore, TOKEN_ATTR : self.__family_attr, @@ -3120,6 +3119,69 @@ class GedcomParser(UpdateCallback): self.__add_msg(txt) self.number_of_errors -= 1 + def __merge_address(self, free_form_address, addr, line, state): + """ + Merge freeform and structured addrssses. + n ADDR {0:1} + +1 CONT {0:M} + +1 ADR1 {0:1} (Street) + +1 ADR2 {0:1} (Locality) + +1 CITY {0:1} + +1 STAE {0:1} + +1 POST {0:1} + +1 CTRY {0:1} + + This is done along the lines suggested by Tamura Jones in + http://www.tamurajones.net/GEDCOMADDR.xhtml as a result of bug 6382. 
+ "When a GEDCOM reader encounters a double address, it should read the + structured address. ... A GEDCOM reader that does verify that the + addresses are the same should issue an error if they are not". + + This is called for SUBMitter addresses (__subm_addr), INDIvidual + addresses (__person_addr), REPO addresses and HEADer corp address + (__repo_addr) and EVENt addresses (__event_addr). + + The structured address (if any) will have been accumulated into an + object of type LocationBase, which will either be a Location, or an + Address object. + + If ADDR is provided, but none of ADR1, ADR2, CITY, STAE, or POST (not + CTRY), then Street is set to the freeform address. N.B. this is a change + for Repository addresses and HEADer Corp address where previously the + free-form address was deconstructed into different structured + components. N.B. PAF provides a free-form address and a country, so this + allows for that case. + + If both forms of address are provided, then the structured address is + used, and if the ADDR/CONT contains anything not in the structured + address, a warning is issued. + + If just ADR1, ADR2, CITY, STAE, POST or CTRY are provided (this is not + actually legal GEDCOM syntax, but may be possible by GEDCOM extensions) + then just the structured address is used. 
+ """ + if not (addr.get_street() or addr.get_locality() or + addr.get_city() or addr.get_state() or + addr.get_postal_code()): + + addr.set_street(free_form_address) + else: + # structured address provided + addr_list = free_form_address.split("\n") + str_list = [] + for func in (addr.get_street(), addr.get_locality(), + addr.get_city(), addr.get_state(), + addr.get_postal_code(), addr.get_country()): + str_list += [i.strip(',' + string.whitespace) for i in func.split("\n")] + for elmn in addr_list: + if elmn.strip(',' + string.whitespace) not in str_list: + # message means that the element %s was ignored, but + # expressed the wrong way round because the message is + # truncated for output + self.__add_msg(_("ADDR element ignored '%s'" + % elmn), line, state) + # The free-form address ADDR is discarded + def __parse_trailer(self): """ Looks for the expected TRLR token @@ -3756,7 +3818,7 @@ class GedcomParser(UpdateCallback): def __person_addr(self, line, state): """ - Parses the Address structure + Parses the INDIvidual n ADDR {0:1} +1 CONT {0:M} @@ -3766,20 +3828,22 @@ class GedcomParser(UpdateCallback): +1 STAE {0:1} +1 POST {0:1} +1 CTRY {0:1} - n PHON {0:3} @param line: The current line in GedLine format @type line: GedLine @param state: The current state @type state: CurrentState """ - sub_state = CurrentState() - sub_state.level = state.level+1 + free_form = line.data + + sub_state = CurrentState(level=state.level + 1) sub_state.addr = gen.lib.Address() - sub_state.addr.set_street(line.data) - state.person.add_address(sub_state.addr) + self.__parse_level(sub_state, self.parse_addr_tbl, self.__ignore) state.msg += sub_state.msg + + self.__merge_address(free_form, sub_state.addr, line, state) + state.person.add_address(sub_state.addr) def __person_phon(self, line, state): """ @@ -4871,17 +4935,6 @@ class GedcomParser(UpdateCallback): """ self.__parse_change(line, state.family, state.level+1, state) - def __family_addr(self, line, state): - """ - @param line: 
The current line in GedLine format - @type line: GedLine - @param state: The current state - @type state: CurrentState - """ - state.addr = gen.lib.Address() - state.addr.set_street(line.data) - self.__parse_level(state, self.parse_addr_tbl, self.__ignore) - def __family_attr(self, line, state): """ @param line: The current line in GedLine format @@ -5232,20 +5285,34 @@ class GedcomParser(UpdateCallback): def __event_addr(self, line, state): """ + Parses the EVENt + + n ADDR {0:1} + +1 CONT {0:M} + +1 ADR1 {0:1} (Street) + +1 ADR2 {0:1} (Locality) + +1 CITY {0:1} + +1 STAE {0:1} + +1 POST {0:1} + +1 CTRY {0:1} + @param line: The current line in GedLine format @type line: GedLine @param state: The current state @type state: CurrentState """ + free_form = line.data + sub_state = CurrentState(level=state.level+1) sub_state.location = gen.lib.Location() - sub_state.location.set_street(line.data) sub_state.note = [] sub_state.event = state.event self.__parse_level(sub_state, self.parse_loc_tbl, self.__undefined) state.msg += sub_state.msg + self.__merge_address(free_form, sub_state.location, line, state) + location = sub_state.location note_list = sub_state.note @@ -5646,6 +5713,7 @@ class GedcomParser(UpdateCallback): @type state: CurrentState """ # The ADDR may already have been parsed by the level above + assert state.addr.get_street() == "" if state.addr.get_street() != "": self.__add_msg(_("Warn: ADDR overwritten"), line, state) state.addr.set_street(line.data) @@ -5916,6 +5984,7 @@ class GedcomParser(UpdateCallback): state = CurrentState() state.source = self.__find_or_create_source(self.sid_map[name]) + # SOURce with the given gramps_id had no title state.source.set_title(_("No title - ID %s") % state.source.get_gramps_id()) state.level = level @@ -6376,6 +6445,8 @@ class GedcomParser(UpdateCallback): def __repo_addr(self, line, state): """ + Parses the REPOsitory and HEADer COPR + n ADDR {0:1} +1 CONT {0:M} +1 ADR1 {0:1} (Street) @@ -6384,52 +6455,22 @@ class 
GedcomParser(UpdateCallback): +1 STAE {0:1} +1 POST {0:1} +1 CTRY {0:1} - n PHON {0:3} - Some repositories do not try to break up the address, - instead they put everything on a single line. Try to determine - if this happened, and try to fix it. + @param line: The current line in GedLine format + @type line: GedLine + @param state: The current state + @type state: CurrentState """ + free_form = line.data - addr = gen.lib.Address() - addr.set_street(line.data) - - sub_state = CurrentState() - sub_state.level = state.level+1 - sub_state.addr = addr + sub_state = CurrentState(level=state.level + 1) + sub_state.addr = gen.lib.Address() self.__parse_level(sub_state, self.parse_addr_tbl, self.__ignore) state.msg += sub_state.msg - - text = addr.get_street() - if not (addr.get_city() or addr.get_state() or - addr.get_postal_code() or addr.get_country()): - match = ADDR_RE.match(text) - if match: - groups = match.groups() - addr.set_street(groups[0].strip()) - addr.set_city(groups[2].strip()) - addr.set_state(groups[3].strip()) - addr.set_postal_code(groups[4].strip()) - addr.set_country(groups[5].strip()) - - match = ADDR2_RE.match(text) - if match: - groups = match.groups() - addr.set_street(groups[0].strip()) - addr.set_city(groups[2].strip()) - addr.set_state(groups[3].strip()) - addr.set_postal_code(groups[4].strip()) - - match = ADDR3_RE.match(text) - if match: - groups = match.groups() - addr.set_street(groups[0].strip()) - addr.set_city(groups[2].strip()) - addr.set_state(groups[3].strip()) - - state.repo.add_address(addr) + self.__merge_address(free_form, sub_state.addr, line, state) + state.repo.add_address(sub_state.addr) def __repo_phon(self, line, state): """ @@ -6466,22 +6507,6 @@ class GedcomParser(UpdateCallback): url.set_type(gen.lib.UrlType(gen.lib.UrlType.EMAIL)) state.repo.add_url(url) - def __location_addr(self, line, state): - """ - @param line: The current line in GedLine format - @type line: GedLine - @param state: The current state - @type state: 
CurrentState - """ - if not state.location: - state.location = gen.lib.Location() - val = state.location.get_street() - if val: - val = "%s, %s" % (val, line.data.strip()) - else: - val = line.data.strip() - state.location.set_street(val.replace('\n', ' ')) - def __location_date(self, line, state): """ @param line: The current line in GedLine format @@ -7319,25 +7344,36 @@ class GedcomParser(UpdateCallback): def __subm_addr(self, line, state): """ + Parses the SUBMitter address structure + + n ADDR {0:1} + +1 CONT {0:M} + +1 ADR1 {0:1} (Street) + +1 ADR2 {0:1} (Locality) + +1 CITY {0:1} + +1 STAE {0:1} + +1 POST {0:1} + +1 CTRY {0:1} + @param line: The current line in GedLine format @type line: GedLine @param state: The current state @type state: CurrentState """ + free_form = line.data + sub_state = CurrentState(level=state.level + 1) - sub_state.location = gen.lib.Location() - sub_state.location.set_street(line.data) + sub_state.location = state.res self.__parse_level(sub_state, self.parse_loc_tbl, self.__undefined) state.msg += sub_state.msg - location = sub_state.location - state.res.set_address(location.get_street()) - state.res.set_locality(location.get_locality()) - state.res.set_city(location.get_city()) - state.res.set_state(location.get_state()) - state.res.set_country(location.get_country()) - state.res.set_postal_code(location.get_postal_code()) + self.__merge_address(free_form, state.res, line, state) + # Researcher is a sub-type of LocationBase, so get_street and set_street + # which are used in routines called from self.parse_loc_tbl work fine. + # Unfortunately, Researcher also has get_address and set_address, so we + # need to copy the street into that. + state.res.set_address(state.res.get_street()) def __subm_phon(self, line, state): """