Skip to content

Commit 4aa78bd

Browse files
authored
Merge pull request #78 from Georgetown-University-Libraries/sd1320
Refine EAD to DC mapping
2 parents 3e8d983 + 61101a5 commit 4aa78bd

2 files changed

Lines changed: 192 additions & 159 deletions

File tree

  • dspace/src/main
    • edu/georgetown/library/fileAnalyzer/importer
    • resources/edu/georgetown/library/fileAnalyzer
Lines changed: 154 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -1,152 +1,154 @@
1-
package edu.georgetown.library.fileAnalyzer.importer;
2-
3-
import java.io.File;
4-
import java.io.IOException;
5-
import java.text.ParseException;
6-
import java.text.SimpleDateFormat;
7-
import java.util.Date;
8-
import java.util.HashMap;
9-
import java.util.TreeMap;
10-
import java.util.Vector;
11-
import java.util.regex.Pattern;
12-
13-
import javax.xml.transform.TransformerException;
14-
15-
import org.w3c.dom.Document;
16-
import org.xml.sax.SAXException;
17-
18-
19-
import gov.nara.nwts.ftapp.ActionResult;
20-
import gov.nara.nwts.ftapp.FTDriver;
21-
import gov.nara.nwts.ftapp.Timer;
22-
import gov.nara.nwts.ftapp.ftprop.FTPropString;
23-
import gov.nara.nwts.ftapp.importer.DefaultImporter;
24-
import gov.nara.nwts.ftapp.importer.DelimitedFileReader;
25-
import gov.nara.nwts.ftapp.stats.Stats;
26-
import gov.nara.nwts.ftapp.stats.StatsGenerator;
27-
import gov.nara.nwts.ftapp.stats.StatsItem;
28-
import gov.nara.nwts.ftapp.stats.StatsItemConfig;
29-
import gov.nara.nwts.ftapp.stats.StatsItemEnum;
30-
import edu.georgetown.library.fileAnalyzer.util.XMLUtil;
31-
32-
/**
33-
* Importer for tab delimited files
34-
*
35-
* @author TBrady
36-
*
37-
*/
38-
public class EAD2DC extends DefaultImporter {
39-
40-
public static enum EAD2DCStatsItems implements StatsItemEnum {
41-
Record(StatsItem.makeStringStatsItem("Record", 100).setExport(false));
42-
43-
StatsItem si;
44-
45-
EAD2DCStatsItems(StatsItem si) {
46-
this.si = si;
47-
}
48-
49-
public StatsItem si() {
50-
return si;
51-
}
52-
}
53-
54-
public static enum Generator implements StatsGenerator {
55-
INSTANCE;
56-
public Stats create(String key) {
57-
return new Stats(details, key);
58-
}
59-
}
60-
61-
public static StatsItemConfig details = StatsItemConfig
62-
.create(EAD2DCStatsItems.class);
63-
public static String P_COLL = "Collection";
64-
public static String P_RIGHTS = "RIGHTS";
65-
public static String P_REFCOL = "refid-column-name";
66-
67-
68-
public EAD2DC(FTDriver dt) {
69-
super(dt);
70-
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
71-
P_COLL, P_COLL,
72-
"DSpace Collection Handle",""));
73-
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
74-
P_RIGHTS, P_RIGHTS,
75-
"dc.rights statement",""));
76-
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
77-
P_REFCOL, P_REFCOL,
78-
"Metadata registry field name to store the archival object refid","gu.archivesspace.id"));
79-
}
80-
81-
public String toString() {
82-
return "EAD to DSpace Dublin Core";
83-
}
84-
85-
public String getDescription() {
86-
return "This rule will take an exported EAD file and convert archival objects to dublin core metadata.";
87-
}
88-
89-
public String getShortName() {
90-
return "EAD2DC";
91-
}
92-
93-
public ActionResult importFile(File selectedFile) throws IOException {
94-
details = StatsItemConfig.create(EAD2DCStatsItems.class);
95-
HashMap<String, Object> params = new HashMap<>();
96-
params.put("collection", this.getProperty(P_COLL));
97-
params.put("rights", this.getProperty(P_RIGHTS));
98-
params.put("refcol", this.getProperty(P_REFCOL));
99-
Timer timer = new Timer();
100-
TreeMap<String, Stats> types = new TreeMap<String, Stats>();
101-
102-
try {
103-
Document d = XMLUtil.db_ns.parse(selectedFile);
104-
File csv = new File(selectedFile.getParent(), selectedFile.getName()+".csv");
105-
XMLUtil.doTransform(d, csv, "edu/georgetown/library/fileAnalyzer/ead.xsl", params);
106-
DelimitedFileReader dfr = new DelimitedFileReader(csv, ",");
107-
Vector<String> header = dfr.getRow();
108-
for(String col: header) {
109-
details.addStatsItem(col, StatsItem.makeStringStatsItem(col));
110-
}
111-
int rownum = 1_000_000;
112-
for(Vector<String>row=dfr.getRow(); row!=null; row=dfr.getRow()) {
113-
String key = ""+rownum++;
114-
Stats stats = Generator.INSTANCE.create(key);
115-
types.put(key, stats);
116-
for(int i=0; i<header.size(); i++) {
117-
String s = row.size() > i ? row.get(i) : "";
118-
String col = header.get(i);
119-
if (col.equals("dc.date.created[en]")) {
120-
s = normalizeDate(s);
121-
}
122-
stats.appendKeyVal(details.getByKey(col), s);
123-
}
124-
}
125-
} catch (SAXException e) {
126-
e.printStackTrace();
127-
} catch (TransformerException e) {
128-
e.printStackTrace();
129-
}
130-
return new ActionResult(selectedFile, "EAD2DC",
131-
this.toString(), details, types, true, timer.getDuration());
132-
}
133-
134-
public String normalizeDate(String s) {
135-
if (Pattern.matches("^\\d\\d\\d\\d(-\\d\\d(-\\d\\d)?)?", s)) {
136-
return s;
137-
}
138-
try {
139-
Date d = new SimpleDateFormat("DD MMM yyyy").parse(s);
140-
return new SimpleDateFormat("yyyy-MM-DD").format(d);
141-
} catch (ParseException e1) {
142-
// no action
143-
}
144-
try {
145-
Date d = new SimpleDateFormat("MMM yyyy").parse(s);
146-
return new SimpleDateFormat("yyyy-MM").format(d);
147-
} catch (ParseException e1) {
148-
// no action
149-
}
150-
return s;
151-
}
152-
}
1+
package edu.georgetown.library.fileAnalyzer.importer;
2+
3+
import java.io.File;
4+
import java.io.IOException;
5+
import java.io.InputStream;
6+
import java.text.ParseException;
7+
import java.text.SimpleDateFormat;
8+
import java.util.Date;
9+
import java.util.HashMap;
10+
import java.util.TreeMap;
11+
import java.util.Vector;
12+
import java.util.regex.Pattern;
13+
14+
import javax.xml.transform.TransformerException;
15+
16+
import org.w3c.dom.Document;
17+
import org.xml.sax.SAXException;
18+
19+
20+
import gov.nara.nwts.ftapp.ActionResult;
21+
import gov.nara.nwts.ftapp.FTDriver;
22+
import gov.nara.nwts.ftapp.Timer;
23+
import gov.nara.nwts.ftapp.ftprop.FTPropString;
24+
import gov.nara.nwts.ftapp.importer.DefaultImporter;
25+
import gov.nara.nwts.ftapp.importer.DelimitedFileReader;
26+
import gov.nara.nwts.ftapp.stats.Stats;
27+
import gov.nara.nwts.ftapp.stats.StatsGenerator;
28+
import gov.nara.nwts.ftapp.stats.StatsItem;
29+
import gov.nara.nwts.ftapp.stats.StatsItemConfig;
30+
import gov.nara.nwts.ftapp.stats.StatsItemEnum;
31+
import edu.georgetown.library.fileAnalyzer.util.XMLUtil;
32+
33+
/**
34+
* Importer for tab delimited files
35+
*
36+
* @author TBrady
37+
*
38+
*/
39+
public class EAD2DC extends DefaultImporter {
40+
41+
public static enum EAD2DCStatsItems implements StatsItemEnum {
42+
Record(StatsItem.makeStringStatsItem("Record", 100).setExport(false));
43+
44+
StatsItem si;
45+
46+
EAD2DCStatsItems(StatsItem si) {
47+
this.si = si;
48+
}
49+
50+
public StatsItem si() {
51+
return si;
52+
}
53+
}
54+
55+
public static enum Generator implements StatsGenerator {
56+
INSTANCE;
57+
public Stats create(String key) {
58+
return new Stats(details, key);
59+
}
60+
}
61+
62+
public static StatsItemConfig details = StatsItemConfig
63+
.create(EAD2DCStatsItems.class);
64+
public static String P_COLL = "Collection";
65+
public static String P_RIGHTS = "RIGHTS";
66+
public static String P_REFCOL = "refid-column-name";
67+
68+
69+
public EAD2DC(FTDriver dt) {
70+
super(dt);
71+
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
72+
P_COLL, P_COLL,
73+
"DSpace Collection Handle",""));
74+
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
75+
P_RIGHTS, P_RIGHTS,
76+
"dc.rights statement",""));
77+
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(),
78+
P_REFCOL, P_REFCOL,
79+
"Metadata registry field name to store the archival object refid","gu.archivesspace.id"));
80+
}
81+
82+
public String toString() {
83+
return "EAD to DSpace Dublin Core";
84+
}
85+
86+
public String getDescription() {
87+
return "This rule will take an exported EAD file and convert archival objects to dublin core metadata.";
88+
}
89+
90+
public String getShortName() {
91+
return "EAD2DC";
92+
}
93+
94+
public ActionResult importFile(File selectedFile) throws IOException {
95+
details = StatsItemConfig.create(EAD2DCStatsItems.class);
96+
HashMap<String, Object> params = new HashMap<>();
97+
params.put("collection", this.getProperty(P_COLL));
98+
params.put("rights", this.getProperty(P_RIGHTS));
99+
params.put("refcol", this.getProperty(P_REFCOL));
100+
Timer timer = new Timer();
101+
TreeMap<String, Stats> types = new TreeMap<String, Stats>();
102+
103+
try {
104+
Document d = XMLUtil.db_ns.parse(selectedFile);
105+
File csv = new File(selectedFile.getParent(), selectedFile.getName()+".csv");
106+
InputStream in = XMLUtil.getResourceStream(this, "edu/georgetown/library/fileAnalyzer/ead.xsl");
107+
XMLUtil.doTransform(d, csv, in, params);
108+
DelimitedFileReader dfr = new DelimitedFileReader(csv, ",");
109+
Vector<String> header = dfr.getRow();
110+
for(String col: header) {
111+
details.addStatsItem(col, StatsItem.makeStringStatsItem(col));
112+
}
113+
int rownum = 1_000_000;
114+
for(Vector<String>row=dfr.getRow(); row!=null; row=dfr.getRow()) {
115+
String key = ""+rownum++;
116+
Stats stats = Generator.INSTANCE.create(key);
117+
types.put(key, stats);
118+
for(int i=0; i<header.size(); i++) {
119+
String s = row.size() > i ? row.get(i) : "";
120+
String col = header.get(i);
121+
if (col.equals("dc.date.created[en]")) {
122+
s = normalizeDate(s);
123+
}
124+
stats.appendKeyVal(details.getByKey(col), s);
125+
}
126+
}
127+
} catch (SAXException e) {
128+
e.printStackTrace();
129+
} catch (TransformerException e) {
130+
e.printStackTrace();
131+
}
132+
return new ActionResult(selectedFile, "EAD2DC",
133+
this.toString(), details, types, true, timer.getDuration());
134+
}
135+
136+
public String normalizeDate(String s) {
137+
if (Pattern.matches("^\\d\\d\\d\\d(-\\d\\d(-\\d\\d)?)?", s)) {
138+
return s;
139+
}
140+
try {
141+
Date d = new SimpleDateFormat("DD MMM yyyy").parse(s);
142+
return new SimpleDateFormat("yyyy-MM-DD").format(d);
143+
} catch (ParseException e1) {
144+
// no action
145+
}
146+
try {
147+
Date d = new SimpleDateFormat("MMM yyyy").parse(s);
148+
return new SimpleDateFormat("yyyy-MM").format(d);
149+
} catch (ParseException e1) {
150+
// no action
151+
}
152+
return s;
153+
}
154+
}

0 commit comments

Comments
 (0)