From 1cada500de65b3125e62eb905e3d5c2a10671cbc Mon Sep 17 00:00:00 2001 From: prculley Date: Wed, 27 Jul 2016 08:22:38 -0500 Subject: [PATCH 1/2] Gedcom import media improvements part 2 This PR supports a variety of changes to Media support, mostly having to do with embedded OBJE tag lines. - Support v5.5.1 OBJE/FORM/MEDI tag on embedded OBJE - Change OBJE/BLOB tag from "not understood" to "Recognized but not supported" - Detect multiple FILEs in a single embedded OBJE >> v5.5.1 feature not supported - Changed OBJE/FORM/TYPE to internationalize the data (same as SourceMediaType) - Corrected XREF/OBJE/FORM to correctly save and use its value (was a typo in original code) and to accept any case. - Added test for missing title on embedded OBJE, and use filename if missing. - Added test for missing FILE on XREF style OBJE, with warning like the embedded OBJE. - Support _PRIM tag to handle primary photo for Legacy Family Tree, Ancestral Quest, Family Origins, MyHeritage Family Tree Builder, and others. - Some of these changes would have required making patches to six different nearly identical pieces of code. So I combined the similar methods and the two functions they called into a single function. - Added function to check for a subordinate line with a specific tag, and report unexpected lines as not understood. Used that function in OBJE/REFN/TYPE and OBJE/_PRIM processing. - Test files imp_FTM_PHOTO.ged and imp_MediaTest.ged have been updated to include testing on these changes. --- gramps/plugins/lib/libgedcom.py | 409 +++++++++++++++++--------------- 1 file changed, 219 insertions(+), 190 deletions(-) diff --git a/gramps/plugins/lib/libgedcom.py b/gramps/plugins/lib/libgedcom.py index 38d6b45db..bf37c4a05 100755 --- a/gramps/plugins/lib/libgedcom.py +++ b/gramps/plugins/lib/libgedcom.py @@ -270,6 +270,7 @@ TOKEN__ADPN = 130 TOKEN__FSFTID = 131 TOKEN__PHOTO = 132 TOKEN__LINK = 133 +TOKEN__PRIM = 134 TOKENS = { "HEAD" : TOKEN_HEAD, "MEDI" : TOKEN_MEDI, @@ -363,7 +364,7 @@ TOKENS = { "_DETAIL" : TOKEN_IGNORE,"_PREF" : TOKEN__PRIMARY, "_LKD" : TOKEN__LKD, "_DATE" : TOKEN_IGNORE, "_SCBK" : TOKEN_IGNORE,"_TYPE" : TOKEN_TYPE, - "_PRIM" : TOKEN_IGNORE,"_SSHOW" : TOKEN_IGNORE, + "_PRIM" : TOKEN__PRIM, "_SSHOW" : TOKEN_IGNORE, "_PAREN" : TOKEN_IGNORE,"BLOB" : TOKEN_BLOB, "CONL" : TOKEN_CONL, "RESN" : TOKEN_RESN, "_MEDI" : TOKEN_MEDI, "_MASTER" : TOKEN_IGNORE, @@ -1605,7 +1606,7 @@ class CurrentState: self.note = None self.lds_ord = None self.msg = "" - self.primary = False # _PRIM tag on an INDI.FAMC tag + self.primary = False # _PRIMARY tag on an INDI.FAMC tag self.filename = "" self.title = "" self.addr = None @@ -2320,10 +2321,12 @@ class GedcomParser(UpdateCallback): self.media_parse_tbl = { TOKEN_FORM : self.__media_ref_form, + TOKEN_MEDI : self.__media_ref_medi, # v5.5.1 TOKEN_TITL : self.__media_ref_titl, TOKEN_FILE : self.__media_ref_file, TOKEN_NOTE : self.__media_ref_note, TOKEN_RNOTE : self.__media_ref_note, + TOKEN__PRIM : self.__media_ref_prim, # LFT etc. TOKEN_IGNORE : self.__ignore, } self.func_list.append(self.media_parse_tbl) @@ -2504,7 +2507,7 @@ class GedcomParser(UpdateCallback): TOKEN_NOTE : self.__obje_note, TOKEN_RNOTE : self.__obje_note, TOKEN_SOUR : self.__obje_sour, - TOKEN_BLOB : self.__obje_blob, + TOKEN_BLOB : self.__ignore, # v5.5.1 deprecated TOKEN_REFN : self.__obje_refn, TOKEN_RIN : self.__obje_rin, TOKEN_CHAN : self.__obje_chan, @@ -3062,6 +3065,34 @@ class GedcomParser(UpdateCallback): self.backoff = False return self.groups + def __chk_subordinate(self, level, state, token): + """ + checks for a single subordinate line with specific token. If any other + lines are present, they are not understood. + + @param level: Current level in the file + @type level: int + @param state: The current state + @type state: CurrentState + @param token: The token to search for + @type token: int + """ + skips = 0 + got_line = None + while True: + line = self.__get_next_line() + if self.__level_is_finished(line, level): + if skips: + # This improves formatting when there are long sequences of + # skipped lines + self.__add_msg("", None, None) + return got_line + if line.token == token: + got_line = line + else: + self.__add_msg(_("Line ignored as not understood"), line, state) + skips += 1 + def __undefined(self, line, state): """ @param line: The current line in GedLine format @@ -3794,39 +3825,12 @@ class GedcomParser(UpdateCallback): def __person_object(self, line, state): """ - - Embedded form - - > n OBJE @@ {1:1} - - Linked form - - > n OBJE {1:1} - > +1 FORM {1:1} - > +1 TITL {0:1} - > +1 FILE {1:1} - > +1 <> {0:M} - @param line: The current line in GedLine format @type line: GedLine @param state: The current state @type state: CurrentState """ - if line.data and line.data[0] == '@': - # Reference to a named multimedia object defined elsewhere - gramps_id = self.oid_map[line.data] - - handle = self.__find_media_handle(gramps_id) - ref = MediaRef() - ref.set_reference_handle(handle) - state.person.add_media_reference(ref) - else: - (form, filename, title, note) = self.__obje(state.level+1, state) - if filename == "": - self.__add_msg(_("Filename omitted"), line, state) - if form == "": - self.__add_msg(_("Form omitted"), line, state) - self.build_media(state.person, form, filename, title, note) + self.__obje(line, state, state.person) def __person_photo(self, line, state): """ @@ -4100,6 +4104,9 @@ class GedcomParser(UpdateCallback): state.person.add_address(addr) self.__skip_subordinate_levels(state.level+1, state) + + + def __person_email(self, line, state): """ O INDI @@ -4739,7 +4746,7 @@ class GedcomParser(UpdateCallback): def __person_famc_primary(self, line, state): """ - Parses the _PRIM tag on an INDI.FAMC tag. This value is stored in + Parses the _PRIMARY tag on an INDI.FAMC tag. This value is stored in the state record to be used later. @param line: The current line in GedLine format @@ -5152,21 +5159,7 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ - if line.data and line.data[0] == '@': - # Reference to a named multimedia object defined elsewhere - gramps_id = self.oid_map[line.data] - - handle = self.__find_media_handle(gramps_id) - ref = MediaRef() - ref.set_reference_handle(handle) - state.family.add_media_reference(ref) - else: - (form, filename, title, note) = self.__obje(state.level + 1, state) - if filename == "": - self.__add_msg(_("Filename omitted"), line, state) - if form == "": - self.__add_msg(_("Form omitted"), line, state) - self.build_media(state.family, form, filename, title, note) + self.__obje(line, state, state.family) def __family_comm(self, line, state): """ @@ -5222,30 +5215,118 @@ class GedcomParser(UpdateCallback): attr.set_value(line.data) state.family.add_attribute(attr) - def __obje(self, level, state): + def __obje(self, line, state, pri_obj): """ + Embedded form + + n OBJE @@ {1:1} + +1 _PRIM {0:1} # Indicates primary photo + + Linked form + n OBJE {1:1} - +1 FORM {1:1} + +1 FORM {1:1} # v5.5 layout +1 TITL {0:1} - +1 FILE {1:1} + +1 FILE {1:1} # v5.5.1 allows multiple + +2 FORM {1:1} # v5.5.1 layout + +3 MEDI {0:1} # v5.5.1 layout +1 <> {0:M} + +1 _PRIM {0:1} # Indicates primary photo @param line: The current line in GedLine format @type line: GedLine @param state: The current state @type state: CurrentState + @param pri_obj: The Primary object to which this is attached + @type state: Person # or Family, or Source etc. """ + if line.data and line.data[0] == '@': + # Reference to a named multimedia object defined elsewhere + gramps_id = self.oid_map[line.data] + handle = self.__find_media_handle(gramps_id) + # check to see if this is a primary photo + line = self.__chk_subordinate(state.level+1, state, TOKEN__PRIM) + if line and line.data == 'Y': + state.photo = handle + oref = MediaRef() + oref.set_reference_handle(handle) + pri_obj.add_media_reference(oref) + return + # + # The remainder of this code is similar in concept to __parse_obje + # except that it combines references to the same media file by + # comparing path names. If they are the same, then only the first + # is kept. This does mean that if there are different notes etc. on a + # later OBJE, they will be lost. + # sub_state = CurrentState() sub_state.form = "" + sub_state.attr = None sub_state.filename = "" sub_state.title = "" sub_state.note = "" - sub_state.level = level + sub_state.level = state.level + 1 + sub_state.prim = "" self.__parse_level(sub_state, self.media_parse_tbl, self.__ignore) state.msg += sub_state.msg - return (sub_state.form, sub_state.filename, sub_state.title, - sub_state.note) + if sub_state.filename == "": + self.__add_msg(_("Filename omitted"), line, state) + # The following lines are commented out because Gramps is NOT a + # Gedcom validator! + # if sub_state.form == "": + # self.__add_msg(_("Form omitted"), line, state) + + # The following code that detects URL is an older v5.5 usage; the + # modern option is to use the EMAIL tag. + if isinstance(sub_state.form, str) and sub_state.form == "url": + url = Url() + url.set_path(sub_state.filename) + url.set_description(sub_state.title) + url.set_type(UrlType.WEB_HOME) + pri_obj.add_url(url) + else: + # to allow import of references to URLs (especially for import from + # geni.com), do not try to find the file if it is blatently a URL + res = urlparse(sub_state.filename) + if sub_state.filename != '' and \ + (res.scheme == '' or res.scheme == 'file'): + (valid, path) = self.__find_file(sub_state.filename, + self.dir_path) + if not valid: + self.__add_msg(_("Could not import %s") % + sub_state.filename, line, state) + else: + path = sub_state.filename + # Multiple references to the same media silently drops the later + # ones, even if title, notes etc. are different + photo_handle = self.media_map.get(path) + if photo_handle is None: + photo = MediaObject() + photo.set_path(path) + if sub_state.title: + photo.set_description(sub_state.title) + else: + photo.set_description(path) + full_path = os.path.abspath(path) + if os.path.isfile(full_path): + photo.set_mime_type(get_type(full_path)) + else: + photo.set_mime_type(MIME_MAP.get(sub_state.form, + 'unknown')) + if sub_state.note: + photo.add_note(sub_state.note) + if sub_state.attr: + photo.attribute_list.append(sub_state.attr) + self.dbase.add_object(photo, self.trans) + self.media_map[path] = photo.handle + else: + photo = self.dbase.get_object_from_handle(photo_handle) + if sub_state.prim == "Y": + state.photo = photo.handle + oref = MediaRef() + oref.set_reference_handle(photo.handle) + pri_obj.add_media_reference(oref) def __media_ref_form(self, line, state): """ @@ -5256,7 +5337,22 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ - state.form = line.data + state.form = line.data.lower() + + def __media_ref_medi(self, line, state): + """ + +1 MEDI {0:1} (Photo, Audio, Book, etc.) + + @param line: The current line in GedLine format + @type line: GedLine + @param state: The current state + @type state: CurrentState + """ + state.attr = Attribute() + mtype = MEDIA_MAP.get(line.data.lower(), + (SourceMediaType.CUSTOM, line.data)) + state.attr.set_type(_('Media-Type')) + state.attr.set_value(str(SourceMediaType(mtype))) def __media_ref_titl(self, line, state): """ @@ -5278,6 +5374,11 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ + if state.filename != "": + self.__add_msg(_("Multiple FILE in a single OBJE ignored"), + line, state) + self.__skip_subordinate_levels(state.level+1, state) + return state.filename = line.data def __media_ref_note(self, line, state): @@ -5323,6 +5424,19 @@ class GedcomParser(UpdateCallback): self.dbase.commit_note(new_note, self.trans, new_note.change) state.note = new_note.get_handle() + def __media_ref_prim(self, line, state): + """ + +1 _PRIM {0:1} + + Indicates that this OBJE is the primary photo. + + @param line: The current line in GedLine format + @type line: GedLine + @param state: The current state + @type state: CurrentState + """ + state.prim = line.data + def __family_adopt(self, line, state): """ n ADOP @@ -5380,21 +5494,7 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ - if line.data and line.data[0] == '@': - # Reference to a named multimedia object defined elsewhere - gramps_id = self.oid_map[line.data] - - handle = self.__find_media_handle(gramps_id) - ref = MediaRef() - ref.set_reference_handle(handle) - state.event.add_media_reference(ref) - else: - (form, filename, title, note) = self.__obje(state.level + 1, state) - if filename == "": - self.__add_msg(_("Filename omitted"), line, state) - if form == "": - self.__add_msg(_("Form omitted"), line, state) - self.build_media(state.event, form, filename, title, note) + self.__obje(line, state, state.event) def __event_type(self, line, state): """ @@ -5513,22 +5613,7 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ - if line.data and line.data[0] == '@': - # Reference to a named multimedia object defined elsewhere - gramps_id = self.oid_map[line.data] - - handle = self.__find_media_handle(gramps_id) - ref = MediaRef() - ref.set_reference_handle(handle) - state.place.add_media_reference(ref) - else: - # FIXME this should probably be level+1 - (form, filename, title, note) = self.__obje(state.level, state) - if filename == "": - self.__add_msg(_("Filename omitted"), line, state) - if form == "": - self.__add_msg(_("Form omitted"), line, state) - self.build_media(state.place, form, filename, title, note) + self.__obje(line, state, state.place) def __event_place_sour(self, line, state): """ @@ -6200,21 +6285,7 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ - if line.data and line.data[0] == '@': - # Reference to a named multimedia object defined elsewhere - gramps_id = self.oid_map[line.data] - - handle = self.__find_media_handle(gramps_id) - ref = MediaRef() - ref.set_reference_handle(handle) - state.citation.add_media_reference(ref) - else: - (form, filename, title, note) = self.__obje(state.level+1, state) - if filename == "": - self.__add_msg(_("Filename omitted"), line, state) - if form == "": - self.__add_msg(_("Form omitted"), line, state) - self.build_media(state.citation, form, filename, title, note) + self.__obje(line, state, state.citation) def __citation_refn(self, line, state): """ @@ -6353,21 +6424,7 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ - if line.data and line.data[0] == '@': - # Reference to a named multimedia object defined elsewhere - gramps_id = self.oid_map[line.data] - - handle = self.__find_media_handle(gramps_id) - ref = MediaRef() - ref.set_reference_handle(handle) - state.source.add_media_reference(ref) - else: - (form, filename, title, note) = self.__obje(state.level+1, state) - if filename == "": - self.__add_msg(_("Filename omitted"), line, state) - if form == "": - self.__add_msg(_("Form omitted"), line, state) - self.build_media(state.source, form, filename, title, note) + self.__obje(line, state, state.source) def __source_chan(self, line, state): """ @@ -6594,6 +6651,8 @@ class GedcomParser(UpdateCallback): self.__parse_level(state, self.obje_func, self.__undefined) + if state.media.get_path() == "": + self.__add_msg(_("Filename omitted"), line, state) # Add the default reference if no source has found self.__add_default_source(media) @@ -6612,9 +6671,7 @@ class GedcomParser(UpdateCallback): @param state: The current state @type state: CurrentState """ - # TODO: FIX THIS!!! - state.media_form = line.data.strip() - self.__skip_subordinate_levels(state.level+1, state) + state.form = line.data.lower().strip() def __obje_file(self, line, state): """ @@ -6633,7 +6690,7 @@ class GedcomParser(UpdateCallback): res = urlparse(line.data) if line.data != '' and (res.scheme == '' or res.scheme == 'file'): (file_ok, filename) = self.__find_file(line.data, self.dir_path) - if state.media != "URL": + if state.form != "url": # Might not work if FORM doesn't precede FILE if not file_ok: self.__add_msg(_("Could not import %s") % filename, line, state) @@ -6699,16 +6756,6 @@ class GedcomParser(UpdateCallback): """ state.media.add_citation(self.handle_source(line, state.level, state)) - def __obje_blob(self, line, state): - """ - @param line: The current line in GedLine format - @type line: GedLine - @param state: The current state - @type state: CurrentState - """ - self.__add_msg(_("BLOB ignored"), line, state) - self.__skip_subordinate_levels(state.level+1, state) - def __obje_refn(self, line, state): """ @param line: The current line in GedLine format @@ -6720,29 +6767,34 @@ class GedcomParser(UpdateCallback): attr.set_type(line.token_text) # Atrribute : REFN attr.set_value(line.data) # if there is a subsequent TYPE, we add it as a note to the attribute - while True: - line = self.__get_next_line() - if self.__level_is_finished(line, state.level+1): - break - elif line.token == TOKEN_TYPE: - new_note = Note(line.data) - new_note.set_gramps_id(self.nid_map[""]) - new_note.set_handle(create_id()) - new_note.set_type('REFN-TYPE') - self.dbase.commit_note(new_note, self.trans, new_note.change) - attr.add_note(new_note.get_handle()) + line = self.__chk_subordinate(state.level+1, state, TOKEN_TYPE) + if line: + new_note = Note(line.data) + new_note.set_gramps_id(self.nid_map[""]) + new_note.set_handle(create_id()) + new_note.set_type('REFN-TYPE') + self.dbase.commit_note(new_note, self.trans, new_note.change) + attr.add_note(new_note.get_handle()) state.media.attribute_list.append(attr) def __obje_type(self, line, state): """ + +1 FILE {1:M} + +2 FORM {1:1} + +3 TYPE {0:1} # v5.5.1 + + Source_Media_type is one of (Photo, Audio, Book, etc.) + @param line: The current line in GedLine format @type line: GedLine @param state: The current state @type state: CurrentState """ attr = Attribute() + mtype = MEDIA_MAP.get(line.data.lower(), + (SourceMediaType.CUSTOM, line.data)) attr.set_type(_('Media-Type')) - attr.set_value(line.data) # (Photo, Audio, Book, etc.) + attr.set_value(str(SourceMediaType(mtype))) state.media.attribute_list.append(attr) def __obje_rin(self, line, state): @@ -7675,45 +7727,6 @@ class GedcomParser(UpdateCallback): # too far in the future, this gives OverflowError. pass - def build_media(self, obj, form, filename, title, note): - if isinstance(form, str) and form.lower() == "url": - url = Url() - url.set_path(filename) - url.set_description(title) - url.set_type(UrlType.WEB_HOME) - obj.add_url(url) - else: - # to allow import of references to URLs (especially for import from - # geni.com), do not try to find the files if they are blatently URLs - res = urlparse(filename) - if filename != '' and (res.scheme == '' or res.scheme == 'file'): - (valid, path) = self.__find_file(filename, self.dir_path) - if not valid: - self.__add_msg(_("Could not import %s") % filename) - else: - path = filename - # Multiple references to the same media silently drops the later - # ones, even if title, notes etc. are different - photo_handle = self.media_map.get(path) - if photo_handle is None: - photo = MediaObject() - photo.set_path(path) - photo.set_description(title) - full_path = os.path.abspath(path) - if os.path.isfile(full_path): - photo.set_mime_type(get_type(full_path)) - else: - photo.set_mime_type(MIME_MAP.get(form.lower(), 'unknown')) - if note: - photo.add_note(note) - self.dbase.add_object(photo, self.trans) - self.media_map[path] = photo.handle - else: - photo = self.dbase.get_object_from_handle(photo_handle) - oref = MediaRef() - oref.set_reference_handle(photo.handle) - obj.add_media_reference(oref) - def __build_event_pair(self, state, event_type, event_map, description): """ n TYPE {0:1} p.* @@ -7783,26 +7796,42 @@ class GedcomParser(UpdateCallback): def __do_photo(self, state): """ Choose the primary photo from the list of media present for this - person. Supports FTM _PHOTO feature. + person. Supports FTM _PHOTO. and others _PRIM feature. 0 INDI +1 _PHOTO @@ {1:1} + + 0 INDI + +1 OBJE @@ + +2 _PRIM + + 0 INDI + +1 OBJE + +2 FILE primary_photo.jpg + +2 _PRIM + + For the _PHOTO varient, state.photo contains the XREF ('@M1@'). + For the _PRIM varients, state.photo contains the handle. Since Gramps currently uses the first media in the list as the primary, find the primary photo if already in the list, if present, move to beginning. If not present, add at the beginning. This is run after all of the person processing is complete but before committing the person. """ - if state.photo: + if state.photo.startswith('@'): gramps_id = self.oid_map[state.photo] handle = self.__find_media_handle(gramps_id) - for mref in state.person.media_list: - if handle == mref.ref: - state.person.media_list.remove(mref) - state.person.media_list.insert(0, mref) - return - mref = MediaRef() - mref.set_reference_handle(handle) - state.person.media_list.insert(0, mref) + elif state.photo: + handle = state.photo + else: + return + for mref in state.person.media_list: + if handle == mref.ref: + state.person.media_list.remove(mref) + state.person.media_list.insert(0, mref) + return + mref = MediaRef() + mref.set_reference_handle(handle) + state.person.media_list.insert(0, mref) def __extract_temple(self, line): def get_code(code): From 43c1f24c2db921f4fc8832377c9dc26b0537431e Mon Sep 17 00:00:00 2001 From: prculley Date: Thu, 4 Aug 2016 11:20:19 -0500 Subject: [PATCH 2/2] Fix whitespace --- gramps/plugins/lib/libgedcom.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gramps/plugins/lib/libgedcom.py b/gramps/plugins/lib/libgedcom.py index bf37c4a05..6a088f860 100755 --- a/gramps/plugins/lib/libgedcom.py +++ b/gramps/plugins/lib/libgedcom.py @@ -4104,9 +4104,6 @@ class GedcomParser(UpdateCallback): state.person.add_address(addr) self.__skip_subordinate_levels(state.level+1, state) - - - def __person_email(self, line, state): """ O INDI