<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><meta http-equiv="content-type" content="text/html; charset=utf-8" /><style type="text/css"><!--
/* Commit metadata panel: revision, author and date list. */
#msg dl { border: 1px #006 solid; background: #369; padding: 6px; color: #fff; }
#msg dt { float: left; width: 6em; font-weight: bold; }
#msg dt:after { content:':';}
#msg dl, #msg dt, #msg ul, #msg li, #header, #footer { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt; }
#msg dl a { font-weight: bold}
/* Link states are declared in link, visited, active order. All three have
   equal specificity, so :active must come last or :visited masks it on
   links that have already been followed. */
#msg dl a:link { color:#fc3; }
#msg dl a:visited { color:#cc6; }
#msg dl a:active { color:#ff0; }
/* Section headings, log message and path lists. */
h3 { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt; font-weight: bold; }
#msg pre, #msg p { overflow: auto; background: #ffc; border: 1px #fc0 solid; padding: 6px; }
#msg ul { overflow: auto; }
#header, #footer { color: #fff; background: #636; border: 1px #300 solid; padding: 6px; }
/* Diff area: one bordered box per changed file, with a coloured title bar. */
#patch { width: 100%; }
#patch h4 {font-family: verdana,arial,helvetica,sans-serif;font-size:10pt;padding:8px;background:#369;color:#fff;margin:0;}
#patch .propset h4, #patch .binary h4 {margin:0;}
#patch pre {padding:0;line-height:1.2em;margin:0;}
#patch .diff {width:100%;background:#eee;padding: 0 0 10px 0;overflow:auto;}
#patch .propset .diff, #patch .binary .diff {padding:10px 0;}
#patch span {display:block;padding:0 10px;}
#patch .modfile, #patch .addfile, #patch .delfile, #patch .propset, #patch .binary, #patch .copfile {border:1px solid #ccc;margin:10px 0;}
/* Added and removed lines are shown as full-width coloured rows. */
#patch ins {background:#dfd;text-decoration:none;display:block;padding:0 10px;}
#patch del {background:#fdd;text-decoration:none;display:block;padding:0 10px;}
/* Hunk headers and file header lines; both selectors are scoped to the
   diff area so the rule cannot leak onto unrelated .info elements. */
#patch .lines, #patch .info {color:#888;background:#fff;}
--></style>
<title>[560] blacklight_importer/src/marcoverride:
Updates to support more permissive reading</title>
</head>
<body>
<div id="msg">
<dl>
<dt>Revision</dt> <dd>560</dd>
<dt>Author</dt> <dd>haschart</dd>
<dt>Date</dt> <dd>2008-05-14 17:42:42 -0400 (Wed, 14 May 2008)</dd>
</dl>
<h3>Log Message</h3>
<pre>Updates to support more permissive reading</pre>
<h3>Modified Paths</h3>
<ul>
<li><a href="#blacklight_importersrcmarcoverrideMarcDirStreamReaderjava">blacklight_importer/src/marcoverride/MarcDirStreamReader.java</a></li>
</ul>
<h3>Added Paths</h3>
<ul>
<li><a href="#blacklight_importersrcmarcoverrideMarcPermissiveStreamReaderjava">blacklight_importer/src/marcoverride/MarcPermissiveStreamReader.java</a></li>
</ul>
</div>
<div id="patch">
<h3>Diff</h3>
<a id="blacklight_importersrcmarcoverrideMarcDirStreamReaderjava"></a>
<div class="modfile"><h4>Modified: blacklight_importer/src/marcoverride/MarcDirStreamReader.java (559 => 560)</h4>
<pre class="diff"><span>
<span class="info">--- blacklight_importer/src/marcoverride/MarcDirStreamReader.java        2008-05-14 21:42:06 UTC (rev 559)
+++ blacklight_importer/src/marcoverride/MarcDirStreamReader.java        2008-05-14 21:42:42 UTC (rev 560)
</span><span class="lines">@@ -15,20 +15,32 @@
</span><span class="cx"> File list[];
</span><span class="cx"> MarcReader curFileReader;
</span><span class="cx"> int curFileNum;
</span><ins>+ boolean permissive;
</ins><span class="cx">
</span><span class="cx"> public MarcDirStreamReader(String dirName)
</span><span class="cx"> {
</span><span class="cx"> File dir = new File(dirName);
</span><del>- init(dir);
</del><ins>+ init(dir, false);
</ins><span class="cx"> }
</span><span class="cx">
</span><span class="cx"> public MarcDirStreamReader(File dir)
</span><span class="cx"> {
</span><del>- init(dir);
</del><ins>+ init(dir, false);
</ins><span class="cx"> }
</span><span class="cx">
</span><del>- private void init(File dir)
</del><ins>+ public MarcDirStreamReader(String dirName, boolean permissive)
</ins><span class="cx"> {
</span><ins>+ File dir = new File(dirName);
+ init(dir, permissive);
+ }
+
+ public MarcDirStreamReader(File dir, boolean permissive)
+ {
+ init(dir, permissive);
+ }
+
+ private void init(File dir, boolean permissive)
+ {
</ins><span class="cx"> FilenameFilter filter = new FilenameFilter()
</span><span class="cx"> {
</span><span class="cx"> public boolean accept(File dir, String name)
</span><span class="lines">@@ -36,6 +48,7 @@
</span><span class="cx"> return(name.endsWith("mrc"));
</span><span class="cx"> }
</span><span class="cx"> };
</span><ins>+ this.permissive = permissive;
</ins><span class="cx"> list = dir.listFiles(filter);
</span><span class="cx"> java.util.Arrays.sort(list);
</span><span class="cx"> curFileNum = 0;
</span><span class="lines">@@ -58,7 +71,7 @@
</span><span class="cx"> try
</span><span class="cx"> {
</span><span class="cx"> System.err.println("Switching to input file: "+ list[curFileNum]);
</span><del>- curFileReader = new MarcStreamReader(new FileInputStream(list[curFileNum++]));
</del><ins>+ curFileReader = new MarcPermissiveStreamReader(new FileInputStream(list[curFileNum++]), permissive);
</ins><span class="cx"> }
</span><span class="cx"> catch (FileNotFoundException e)
</span><span class="cx"> {
</span></span></pre></div>
<a id="blacklight_importersrcmarcoverrideMarcPermissiveStreamReaderjava"></a>
<div class="addfile"><h4>Added: blacklight_importer/src/marcoverride/MarcPermissiveStreamReader.java (0 => 560)</h4>
<pre class="diff"><span>
<span class="info">--- blacklight_importer/src/marcoverride/MarcPermissiveStreamReader.java         (rev 0)
+++ blacklight_importer/src/marcoverride/MarcPermissiveStreamReader.java        2008-05-14 21:42:42 UTC (rev 560)
</span><span class="lines">@@ -0,0 +1,895 @@
</span><ins>+// $Id: MarcStreamReader.java,v 1.10 2006/12/04 18:45:44 bpeters Exp $
+/**
+ * Copyright (C) 2004 Bas Peters
+ *
+ * This file is part of MARC4J
+ *
+ * MARC4J is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * MARC4J is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with MARC4J; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package marcoverride;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+
+import org.marc4j.Constants;
+import org.marc4j.MarcException;
+import org.marc4j.MarcReader;
+import org.marc4j.converter.CharConverter;
+import org.marc4j.converter.impl.AnselToUnicode;
+import org.marc4j.converter.impl.Iso5426ToUnicode;
+import org.marc4j.marc.ControlField;
+import org.marc4j.marc.DataField;
+import org.marc4j.marc.Leader;
+import org.marc4j.marc.MarcFactory;
+import org.marc4j.marc.Record;
+import org.marc4j.marc.Subfield;
+import org.marc4j.marc.VariableField;
+import org.marc4j.marc.impl.Verifier;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
+/**
+ * An iterator over a collection of MARC records in ISO 2709 format.
+ * &lt;p&gt;
+ * Example usage:
+ *
+ * &lt;pre&gt;
+ * InputStream input = new FileInputStream(&amp;quot;file.mrc&amp;quot;);
+ * MarcReader reader = new MarcStreamReader(input);
+ * while (reader.hasNext()) {
+ * Record record = reader.next();
+ * // Process record
+ * }
+ * &lt;/pre&gt;
+ *
+ * &lt;p&gt;
+ * Check the {@link org.marc4j.marc}&amp;nbsp;package for examples about the use of
+ * the {@link org.marc4j.marc.Record}&amp;nbsp;object model.
+ * &lt;/p&gt;
+ *
+ * &lt;p&gt;
+ * When no encoding is given as an constructor argument the parser tries to
+ * resolve the encoding by looking at the character coding scheme (leader
+ * position 9) in MARC21 records. For UNIMARC records this position is not
+ * defined.
+ * &lt;/p&gt;
+ *
+ * @author Bas Peters
+ * @version $Revision: 1.10 $
+ *
+ */
+public class MarcPermissiveStreamReader implements MarcReader {
+
+ private DataInputStream input = null;
+
+ private Record record;
+
+ private MarcFactory factory;
+
+ private String encoding = "ISO8859_1";
+
+ private boolean override = false;
+
+ private boolean hasNext = true;
+
+ private boolean permissive = false;
+
+ private CharConverter converterAnsel = null;
+
+ private CharConverter converterUnimarc = null;
+
+ private String conversionCheck1 = null;
+
+ private String conversionCheck2 = null;
+
+ private String conversionCheck3 = null;
+
+ private static HashMap&lt;String, String&gt; langMap = null;
+
+ /**
+ * Constructs an instance with the specified input stream.
+ */
+ public MarcPermissiveStreamReader(InputStream input, boolean permissive) {
+ this(input, null, permissive);
+ }
+
+ /**
+ * Constructs an instance with the specified input stream.
+ */
+ public MarcPermissiveStreamReader(InputStream input, String encoding) {
+ this(input, encoding, true);
+ }
+
+ /**
+ * Constructs an instance with the specified input stream and character
+ * encoding.
+ */
+ public MarcPermissiveStreamReader(InputStream input, String encoding, boolean permissive) {
+ this.permissive = permissive;
+ this.input = new DataInputStream(new BufferedInputStream(input));
+ factory = MarcFactory.newInstance();
+ if (encoding != null) {
+ this.encoding = encoding;
+ override = true;
+ }
+ }
+
+ /**
+ * Returns true if the iteration has more records, false otherwise.
+ */
+ public boolean hasNext() {
+ try {
+ if (input.available() == 0)
+ return false;
+ } catch (IOException e) {
+ throw new MarcException(e.getMessage(), e);
+ }
+ return true;
+ }
+
+ /**
+ * Returns the next record in the iteration.
+ *
+ * @return Record - the record object
+ */
+ public Record next()
+ {
+ record = factory.newRecord();
+
+ try {
+
+ byte[] byteArray = new byte[24];
+ input.readFully(byteArray);
+
+ int recordLength = parseRecordLength(byteArray);
+ byte[] recordBuf = new byte[recordLength - 24];
+ if (permissive)
+ {
+ input.mark(recordLength * 2);
+ input.readFully(recordBuf);
+ if (recordBuf[recordBuf.length-1] != Constants.RT)
+ {
+ recordBuf = rereadPermissively(input, recordBuf, recordLength);
+ recordLength = recordBuf.length + 24;
+ }
+ }
+ else
+ {
+ input.readFully(recordBuf);
+ }
+ String tmp = new String(recordBuf);
+ parseRecord(record, byteArray, recordBuf, recordLength);
+ return(record);
+ }
+ catch (EOFException e) {
+ throw new MarcException("Premature end of file encountered", e);
+ }
+ catch (IOException e) {
+ throw new MarcException("an error occured reading input", e);
+ }
+ }
+
+ private byte[] rereadPermissively(DataInputStream input, byte[] recordBuf, int recordLength) throws IOException
+ {
+ int loc = arrayContainsAt(recordBuf, Constants.RT);
+ if (loc != -1) // stated record length is too long
+ {
+ recordLength = loc + 24;
+ input.reset();
+ recordBuf = new byte[recordLength - 24];
+ input.readFully(recordBuf);
+ }
+ else // stated record length is too short read ahead
+ {
+ loc = recordLength - 24;
+ int c = 0;
+ do
+ {
+ c = input.read();
+ loc++;
+ } while (loc < recordLength + 100 && c != Constants.RT && c != -1);
+
+ if (c == Constants.RT)
+ {
+ recordLength = loc + 24;
+ input.reset();
+ recordBuf = new byte[recordLength - 24];
+ input.readFully(recordBuf);
+ }
+ else if (c == -1)
+ {
+ recordLength = loc + 24;
+ input.reset();
+ recordBuf = new byte[recordLength - 24 + 1];
+ input.readFully(recordBuf);
+ recordBuf[recordBuf.length-1] = Constants.RT;
+ }
+ }
+ return(recordBuf);
+ }
+
+ private void parseRecord(Record record, byte[] byteArray, byte[] recordBuf, int recordLength)
+ {
+ Leader ldr;
+ ldr = factory.newLeader();
+ ldr.setRecordLength(recordLength);
+ int directoryLength=0;
+ conversionCheck1 = "";
+ conversionCheck2 = "";
+ conversionCheck3 = "";
+
+ try {
+ parseLeader(ldr, byteArray);
+ directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
+ }
+ catch (IOException e) {
+ throw new MarcException("error parsing leader with data: "
+ + new String(byteArray), e);
+ }
+ catch (MarcException e) {
+ if (permissive)
+ {
+ if (recordBuf[recordBuf.length-1] == Constants.RT && recordBuf[recordBuf.length-2] == Constants.FT)
+ {
+ System.err.println("Warning: Corrupt record encountered, attempting to read permissively");
+ // make an attempt to recover record.
+ int offset = 0;
+ while (offset < recordBuf.length)
+ {
+ if (recordBuf[offset] == Constants.FT)
+ {
+ break;
+ }
+ offset++;
+ }
+ if (offset % 12 == 1)
+ {
+ // move one byte from body to leader, make new leader, and try again
+ byte oldBody[] = recordBuf;
+ recordBuf = new byte[oldBody.length-1];
+ System.arraycopy(oldBody, 1, recordBuf, 0, oldBody.length-1);
+ directoryLength = offset-1;
+ ldr.setIndicatorCount(2);
+ ldr.setSubfieldCodeLength(2);
+ ldr.setImplDefined1((""+(char)byteArray[7]+" ").toCharArray());
+ ldr.setImplDefined2((""+(char)byteArray[18]+(char)byteArray[19]+(char)byteArray[20]).toCharArray());
+ ldr.setEntryMap("4500".toCharArray());
+ if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a'
+ {
+ ldr.setCharCodingScheme((char)byteArray[10]);
+ }
+ }
+ else if (offset % 12 == 11)
+ {
+ byte oldBody[] = recordBuf;
+ recordBuf = new byte[oldBody.length+1];
+ System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
+ recordBuf[0] = (byte)'0';
+ directoryLength = offset+1;
+ ldr.setIndicatorCount(2);
+ ldr.setSubfieldCodeLength(2);
+ ldr.setImplDefined1((""+(char)byteArray[7]+" ").toCharArray());
+ ldr.setImplDefined2((""+(char)byteArray[16]+(char)byteArray[17]+(char)byteArray[18]).toCharArray());
+ ldr.setEntryMap("4500".toCharArray());
+ if (byteArray[8] == (byte)' ' || byteArray[8] == (byte)'a') // if its ' ' or 'a'
+ {
+ ldr.setCharCodingScheme((char)byteArray[10]);
+ }
+ if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a'
+ {
+ ldr.setCharCodingScheme((char)byteArray[10]);
+ }
+ }
+ else
+ {
+ throw new MarcException("error parsing leader with data: "
+ + new String(byteArray), e);
+ }
+ }
+ }
+ else
+ {
+ throw new MarcException("error parsing leader with data: "
+ + new String(byteArray), e);
+ }
+ }
+
+ // if MARC 21 then check encoding
+ switch (ldr.getCharCodingScheme()) {
+ case ' ':
+ if (!override)
+ encoding = "ISO-8859-1";
+ break;
+ case 'a':
+ if (!override)
+ encoding = "UTF8";
+ }
+ String utfCheck;
+ if (permissive && encoding == "UTF8")
+ {
+ try
+ {
+ utfCheck = new String(recordBuf, "UTF-8");
+ byte byteCheck[] = utfCheck.getBytes("UTF-8");
+ if (recordBuf.length != byteCheck.length)
+ {
+ }
+ for (int i = 0; i < recordBuf.length; i++)
+ {
+ if (recordBuf[i] == 0x1B || byteCheck[i] != recordBuf[i])
+ {
+ encoding = "MARC8-Maybe";
+ break;
+ }
+
+ }
+ if (utfCheck.contains("a$1!"))
+ {
+ encoding = "MARC8-Broken";
+ }
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ record.setLeader(ldr);
+
+ if ((directoryLength % 12) != 0)
+ {
+ if (permissive && directoryLength % 12 == 11 && recordBuf[1] != (byte)'0')
+ {
+ System.err.println("Warning: Corrupt record encountered, attempting to read permissively");
+ byte oldBody[] = recordBuf;
+ recordBuf = new byte[oldBody.length+1];
+ System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
+ recordBuf[0] = (byte)'0';
+ directoryLength = directoryLength+1;
+ }
+ else
+ {
+ throw new MarcException("invalid directory");
+ }
+ }
+ DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf));
+ int size = directoryLength / 12;
+
+ String[] tags = new String[size];
+ int[] lengths = new int[size];
+
+ byte[] tag = new byte[3];
+ byte[] length = new byte[4];
+ byte[] start = new byte[5];
+
+ String tmp;
+
+ try {
+ for (int i = 0; i < size; i++)
+ {
+ inputrec.readFully(tag);
+ tmp = new String(tag);
+ tags[i] = tmp;
+
+ inputrec.readFully(length);
+ tmp = new String(length);
+ lengths[i] = Integer.parseInt(tmp);
+
+ inputrec.readFully(start);
+ }
+
+ if (inputrec.read() != Constants.FT)
+ {
+ throw new MarcException("expected field terminator at end of directory");
+ }
+
+ int numBadLengths = 0;
+
+ for (int i = 0; i < size; i++)
+ {
+ int fieldLength = getFieldLength(inputrec);
+ if (fieldLength+1 != lengths[i] && permissive)
+ {
+ if (numBadLengths < 3 && Math.abs(fieldLength - lengths[i]) < 10)
+ {
+ numBadLengths++;
+ lengths[i] = fieldLength+1;
+ }
+ }
+ if (Verifier.isControlField(tags[i]))
+ {
+ byteArray = new byte[lengths[i] - 1];
+ inputrec.readFully(byteArray);
+
+ if (inputrec.read() != Constants.FT)
+ {
+ throw new MarcException("expected field terminator at end of field");
+ }
+
+ ControlField field = factory.newControlField();
+ field.setTag(tags[i]);
+ field.setData(getDataAsString(byteArray));
+ record.addVariableField(field);
+
+ }
+ else
+ {
+ byteArray = new byte[lengths[i]];
+ inputrec.readFully(byteArray);
+
+ try {
+ record.addVariableField(parseDataField(tags[i],
+ byteArray));
+ } catch (IOException e) {
+ throw new MarcException(
+ "error parsing data field for tag: " + tags[i]
+ + " with data: "
+ + new String(byteArray), e);
+ }
+ }
+ }
+
+ // We've determined that although the record says it is UTF-8, it is not.
+ // Here we make an attempt to determine the actual encoding of the data in the record.
+ if (permissive && conversionCheck1.length() > 1 &&
+ conversionCheck2.length() > 1 && conversionCheck3.length() > 1)
+ {
+ int partToUse = 0;
+ if (conversionCheck2.length() < conversionCheck1.length()
+ && conversionCheck2.length() < conversionCheck3.length())
+ {
+ partToUse = 1;
+ }
+ else if (conversionCheck1.length() > conversionCheck3.length())
+ {
+ partToUse = 0;
+ }
+ else if (conversionCheck3.length() > conversionCheck1.length())
+ {
+ partToUse = 2;
+ }
+ else if (conversionCheck2.equals(conversionCheck3) && !conversionCheck1.trim().contains(" "))
+ {
+ partToUse = 2;
+ }
+ else if (numLetters(conversionCheck1) == 0)
+ {
+ partToUse = 0;
+ }
+ else
+ {
+ CharsetDetector detect = new CharsetDetector();
+ byte m8Bytes[] = null;
+ byte isoBytes[] = null;
+ byte uniBytes[] = null;
+ try
+ {
+ m8Bytes = conversionCheck1.getBytes("ISO-8859-1");
+ uniBytes = conversionCheck2.getBytes("ISO-8859-1");
+ isoBytes = conversionCheck3.getBytes("ISO-8859-1");
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ detect.setText(m8Bytes);
+ CharsetMatch match1 = langDetect(detect, record);
+ System.err.println(match1 != null ? match1.getName() + " " + match1.getConfidence() + " " + match1.getLanguage() : "No Match");
+
+ detect.setText(uniBytes);
+ CharsetMatch match2 = langDetect(detect, record);
+ System.err.println(match2 != null ? match2.getName() + " " + match2.getConfidence() + " " + match2.getLanguage() : "No Match");
+
+ detect.setText(isoBytes);
+ CharsetMatch match3 = langDetect(detect, record);
+ System.err.println(match3 != null ? match3.getName() + " " + match3.getConfidence() + " " + match3.getLanguage() : "No Match");
+
+ if (match1 == null && match2 == null && match3 == null)
+ {
+ partToUse = 0;
+ }
+ else if (match1 != null && (match2 == null || match3 == null))
+ {
+ partToUse = 0;
+ }
+ else if (match1.getConfidence() >= match2.getConfidence() &&
+ match1.getConfidence() >= match3.getConfidence() )
+ {
+ partToUse = 0;
+ }
+// else if (match2.getConfidence() > match1.getConfidence() &&
+// match2.getConfidence() > match3.getConfidence() )
+// {
+// partToUse = 1;
+// }
+ else if (match3.getConfidence() > match1.getConfidence() &&
+ match3.getConfidence() > match2.getConfidence() )
+ {
+ partToUse = 2;
+ }
+ }
+ List&lt;VariableField&gt; fields = record.getVariableFields();
+ Iterator&lt;VariableField&gt; iter = fields.iterator();
+ while (iter.hasNext())
+ {
+ VariableField field = iter.next();
+ if (field instanceof DataField)
+ {
+ DataField df = (DataField)field;
+ List&lt;Subfield&gt; subf = df.getSubfields();
+ Iterator&lt;Subfield&gt; sfiter = subf.iterator();
+ while (sfiter.hasNext())
+ {
+ Subfield sf = sfiter.next();
+ if (sf.getData().contains("%%@%%"))
+ {
+ String parts[] = sf.getData().split("%%@%%", 3);
+ sf.setData(parts[partToUse]);
+ }
+ }
+ }
+ }
+ }
+
+ if (inputrec.read() != Constants.RT)
+ {
+ throw new MarcException("expected record terminator");
+ }
+ }
+ catch (IOException e)
+ {
+ throw new MarcException("an error occured reading input", e);
+ }
+ }
+
+ private int arrayContainsAt(byte[] byteArray, int ft)
+ {
+ for (int i = 0; i < byteArray.length; i++)
+ {
+ if (byteArray[i] == (byte)ft) return(i);
+ }
+ return(-1);
+ }
+
+ private CharsetMatch langDetect(CharsetDetector detect, Record record)
+ {
+ String lang = extractLang(record);
+ if (lang != null && !lang.equals("eng"))
+ {
+ CharsetMatch matches[] = detect.detectAll();
+ for (int i = 0; i < matches.length; i++)
+ {
+ if (langMap(matches[i].getLanguage()).equals(lang))
+ {
+ return(matches[i]);
+ }
+ }
+ return(matches.length > 0 ? matches[0] : null);
+ }
+ else
+ {
+ return(detect.detect());
+ }
+ }
+
+ private Object langMap(String language)
+ {
+ if (langMap == null)
+ {
+ langMap = new HashMap&lt;String, String&gt;();
+ langMap.put("de", "ger");
+ langMap.put("nl", "dut");
+ langMap.put("fr", "fre");
+ langMap.put("fi", "fin");
+ langMap.put("sv", "swe");
+ langMap.put("it", "ita");
+ langMap.put("es", "spa");
+ langMap.put("en", "eng");
+ langMap.put("da", "dan");
+ langMap.put("no", "nor");
+ langMap.put("tr", "tur");
+ langMap.put("hu", "hun");
+ langMap.put("ro", "rum");
+ langMap.put("cs", "cze");
+ langMap.put("pt", "por");
+ langMap.put("pl", "pol");
+ langMap.put("ru", "rus");
+ langMap.put("ar", "ara");
+ langMap.put("el", "gre");
+ langMap.put("he", "heb");
+ langMap.put("pt", "por");
+ langMap.put("pl", "pol");
+ }
+ if (langMap.containsKey(language))
+ {
+ return(langMap.get(language));
+ }
+ return null;
+ }
+
+ private String extractLang(Record record)
+ {
+ VariableField f = record.getVariableField("008");
+ ControlField cf = (ControlField)f;
+ if (cf != null)
+ {
+ String data = cf.getData();
+ if (data.length() >= 38)
+ {
+ String lang = data.substring(35, 38);
+ return(lang);
+ }
+ }
+ return null;
+ }
+
+ private int numLetters(String conversionCheck)
+ {
+ int count = 0;
+ for (int i = 0; i < conversionCheck.length(); i++)
+ {
+ if (Character.isLetter(conversionCheck.charAt(i))) count++;
+ }
+ return(count);
+ }
+
+ private DataField parseDataField(String tag, byte[] field)
+ throws IOException {
+ ByteArrayInputStream bais = new ByteArrayInputStream(field);
+ char ind1 = (char) bais.read();
+ char ind2 = (char) bais.read();
+
+ DataField dataField = factory.newDataField();
+ dataField.setTag(tag);
+ dataField.setIndicator1(ind1);
+ dataField.setIndicator2(ind2);
+
+ int code;
+ int size;
+ int readByte;
+ byte[] data;
+ Subfield subfield;
+ while (true) {
+ readByte = bais.read();
+ if (readByte < 0)
+ break;
+ switch (readByte) {
+ case Constants.US:
+ code = bais.read();
+ if (code < 0)
+ throw new IOException("unexpected end of data field");
+ if (code == Constants.FT)
+ break;
+ size = getSubfieldLength(bais);
+ data = new byte[size];
+ bais.read(data);
+ subfield = factory.newSubfield();
+ subfield.setCode((char) code);
+ subfield.setData(getDataAsString(data));
+ dataField.addSubfield(subfield);
+ break;
+ case Constants.FT:
+ break;
+ }
+ }
+ return dataField;
+ }
+
+ private int getFieldLength(DataInputStream bais) throws IOException
+ {
+ bais.mark(9999);
+ int bytesRead = 0;
+ while (true) {
+ switch (bais.read()) {
+ case Constants.FT:
+ bais.reset();
+ return bytesRead;
+ case -1:
+ bais.reset();
+ if (permissive)
+ return (bytesRead);
+ else
+ throw new IOException("Field not terminated");
+ case Constants.US:
+ default:
+ bytesRead++;
+ }
+ }
+ }
+
+ private int getSubfieldLength(ByteArrayInputStream bais) throws IOException {
+ bais.mark(9999);
+ int bytesRead = 0;
+ while (true) {
+ switch (bais.read()) {
+ case Constants.US:
+ case Constants.FT:
+ bais.reset();
+ return bytesRead;
+ case -1:
+ bais.reset();
+ if (permissive)
+ return (bytesRead);
+ else
+ throw new IOException("subfield not terminated");
+ default:
+ bytesRead++;
+ }
+ }
+ }
+
+ private int parseRecordLength(byte[] leaderData) throws IOException {
+ InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
+ leaderData));
+ int length = -1;
+ char[] tmp = new char[5];
+ isr.read(tmp);
+ try {
+ length = Integer.parseInt(new String(tmp));
+ } catch (NumberFormatException e) {
+ throw new MarcException("unable to parse record length", e);
+ }
+ return(length);
+ }
+
+ private void parseLeader(Leader ldr, byte[] leaderData) throws IOException {
+ InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
+ leaderData));
+ char[] tmp = new char[5];
+ isr.read(tmp);
+ // Skip over bytes for record length, If we get here, its already been computed.
+ ldr.setRecordStatus((char) isr.read());
+ ldr.setTypeOfRecord((char) isr.read());
+ tmp = new char[2];
+ isr.read(tmp);
+ ldr.setImplDefined1(tmp);
+ ldr.setCharCodingScheme((char) isr.read());
+ char indicatorCount = (char) isr.read();
+ char subfieldCodeLength = (char) isr.read();
+ char baseAddr[] = new char[5];
+ isr.read(baseAddr);
+ tmp = new char[3];
+ isr.read(tmp);
+ ldr.setImplDefined2(tmp);
+ tmp = new char[4];
+ isr.read(tmp);
+ ldr.setEntryMap(tmp);
+ isr.close();
+ try {
+ ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount)));
+ } catch (NumberFormatException e) {
+ throw new MarcException("unable to parse indicator count", e);
+ }
+ try {
+ ldr.setSubfieldCodeLength(Integer.parseInt(String
+ .valueOf(subfieldCodeLength)));
+ } catch (NumberFormatException e) {
+ throw new MarcException("unable to parse subfield code length", e);
+ }
+ try {
+ ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr)));
+ } catch (NumberFormatException e) {
+ throw new MarcException("unable to parse base address of data", e);
+ }
+
+ }
+
+ private String getDataAsString(byte[] bytes)
+ {
+ String dataElement = null;
+ if (encoding.equals("UTF-8") || encoding.equals("UTF8"))
+ {
+ try {
+ dataElement = new String(bytes, "UTF8");
+ }
+ catch (UnsupportedEncodingException e) {
+ throw new MarcException("unsupported encoding", e);
+ }
+ }
+ else if (encoding.equals("MARC-8") || encoding.equals("MARC8"))
+ {
+ if (converterAnsel == null) converterAnsel = new AnselToUnicode();
+ dataElement = converterAnsel.convert(bytes);
+ }
+ else if (encoding.equals("MARC8-Maybe"))
+ {
+ if (converterAnsel == null) converterAnsel = new AnselToUnicode();
+ if (converterUnimarc == null) converterUnimarc = new Iso5426ToUnicode();
+ String dataElement1 = converterAnsel.convert(bytes);
+ String dataElement2 = converterUnimarc.convert(bytes);
+ String dataElement3 = null;
+ try
+ {
+ dataElement3 = new String(bytes, "ISO-8859-1");
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ if (dataElement1.equals(dataElement2) && dataElement1.equals(dataElement3))
+ {
+ dataElement = dataElement1;
+ }
+ else
+ {
+ conversionCheck1 = conversionCheck1 + " " + dataElement1;
+ conversionCheck2 = conversionCheck2 + " " + dataElement2;
+ conversionCheck3 = conversionCheck3 + " " + dataElement3;
+ dataElement = dataElement1 + "%%@%%" + dataElement2 + "%%@%%" + dataElement3;
+ }
+ }
+ else if (encoding.equals("MARC8-Broken"))
+ {
+ try
+ {
+ dataElement = new String(bytes, "ISO-8859-1");
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ dataElement = dataElement.replaceAll("&amp;lt;", "&lt;");
+ dataElement = dataElement.replaceAll("&amp;gt;", "&gt;");
+ dataElement = dataElement.replaceAll("&amp;amp;", "&amp;");
+ dataElement = dataElement.replaceAll("&amp;apos;", "'");
+ dataElement = dataElement.replaceAll("&amp;quot;", "\"");
+ String rep1 = ""+(char)0x1b+"\\$1";
+ String rep2 = ""+(char)0x1b+"\\(B";
+ dataElement = dataElement.replaceAll("\\$1", rep1);
+ dataElement = dataElement.replaceAll("\\(B", rep2);
+ dataElement = converterAnsel.convert(dataElement);
+
+ }
+ else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1"))
+ {
+ try {
+ dataElement = new String(bytes, "ISO-8859-1");
+ }
+ catch (UnsupportedEncodingException e) {
+ throw new MarcException("unsupported encoding", e);
+ }
+ }
+ dataElement = dataElement.replaceAll("&amp;lt;", "&lt;");
+ dataElement = dataElement.replaceAll("&amp;gt;", "&gt;");
+ dataElement = dataElement.replaceAll("&amp;amp;", "&amp;");
+ dataElement = dataElement.replaceAll("&amp;apos;", "'");
+ dataElement = dataElement.replaceAll("&amp;quot;", "\"");
+ return dataElement;
+ }
+
+ public boolean isPermissive()
+ {
+ return permissive;
+ }
+
+ public void setPermissive(boolean permissive)
+ {
+ this.permissive = permissive;
+ }
+
+}
</ins><span class="cx">\ No newline at end of file
</span></span></pre>
</div>
</div>
</body>
</html>