<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<!--
  Inline stylesheet for the commit notification page.
  #msg   : commit summary (metadata list, log message, changed paths)
  #patch : rendered diff (one bordered box per changed file)
  NOTE(review): the rules below are wrapped in a comment inside the style
  element. That is harmless under the text/html content type declared above,
  but an XML parser would discard the whole stylesheet; confirm how this
  page is served before removing or keeping the wrapper.
-->
<style type="text/css"><!--
/* Commit summary box: metadata list rendered as "Label: value" rows. */
#msg dl { border: 1px #006 solid; background: #369; padding: 6px; color: #fff; }
#msg dt { float: left; width: 6em; font-weight: bold; }
#msg dt:after { content:':';}
#msg dl, #msg dt, #msg ul, #msg li, #header, #footer { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt;  }
#msg dl a { font-weight: bold}
/* Link states must stay in link, visited, active order: all three selectors
   have equal specificity, so a later rule wins. With :visited last, a visited
   link would never show its :active colour. */
#msg dl a:link    { color:#fc3; }
#msg dl a:visited { color:#cc6; }
#msg dl a:active  { color:#ff0; }
h3 { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt; font-weight: bold; }
/* Log message and path lists scroll instead of stretching the page. */
#msg pre, #msg p { overflow: auto; background: #ffc; border: 1px #fc0 solid; padding: 6px; }
#msg ul { overflow: auto; }
#header, #footer { color: #fff; background: #636; border: 1px #300 solid; padding: 6px; }
/* Diff area: each changed file is a bordered box with an h4 title bar. */
#patch { width: 100%; }
#patch h4 {font-family: verdana,arial,helvetica,sans-serif;font-size:10pt;padding:8px;background:#369;color:#fff;margin:0;}
#patch .propset h4, #patch .binary h4 {margin:0;}
#patch pre {padding:0;line-height:1.2em;margin:0;}
#patch .diff {width:100%;background:#eee;padding: 0 0 10px 0;overflow:auto;}
#patch .propset .diff, #patch .binary .diff  {padding:10px 0;}
/* Every diff line is a block so its background spans the full width. */
#patch span {display:block;padding:0 10px;}
#patch .modfile, #patch .addfile, #patch .delfile, #patch .propset, #patch .binary, #patch .copfile {border:1px solid #ccc;margin:10px 0;}
/* Added lines are green, removed lines are red; no underline or strike-through. */
#patch ins {background:#dfd;text-decoration:none;display:block;padding:0 10px;}
#patch del {background:#fdd;text-decoration:none;display:block;padding:0 10px;}
/* Hunk headers (.lines) and file header lines (.info) are greyed out. */
#patch .lines, .info {color:#888;background:#fff;}
--></style>
<title>[560] blacklight_importer/src/marcoverride:
  Updates to support more permissive reading</title>
</head>
<body>

<div id="msg">
  <!-- Commit summary: revision metadata, log message, and in-page links to each file's diff. -->
  <dl>
    <dt>Revision</dt>
    <dd>560</dd>
    <dt>Author</dt>
    <dd>haschart</dd>
    <dt>Date</dt>
    <dd>2008-05-14 17:42:42 -0400 (Wed, 14 May 2008)</dd>
  </dl>

  <h3>Log Message</h3>
  <pre>Updates to support more permissive reading</pre>

  <!-- Each entry links to the anchor placed in front of that file's diff box. -->
  <h3>Modified Paths</h3>
  <ul>
    <li>
      <a href="#blacklight_importersrcmarcoverrideMarcDirStreamReaderjava">blacklight_importer/src/marcoverride/MarcDirStreamReader.java</a>
    </li>
  </ul>

  <h3>Added Paths</h3>
  <ul>
    <li>
      <a href="#blacklight_importersrcmarcoverrideMarcPermissiveStreamReaderjava">blacklight_importer/src/marcoverride/MarcPermissiveStreamReader.java</a>
    </li>
  </ul>
</div>
<div id="patch">
<h3>Diff</h3>
<!--
  Rendered diff for MarcDirStreamReader.java (rev 559 to rev 560).
  The anchor below is the target of the matching "Modified Paths" link.
  Inside the pre element every diff line is wrapped by role:
    span.info  : the file header lines
    span.lines : hunk headers
    span.cx    : unchanged context lines
    ins / del  : added / removed lines
  The pre content is whitespace-significant diff payload and must not be
  reformatted.
-->
<a id="blacklight_importersrcmarcoverrideMarcDirStreamReaderjava"></a>
<div class="modfile"><h4>Modified: blacklight_importer/src/marcoverride/MarcDirStreamReader.java (559 => 560)</h4>
<pre class="diff"><span>
<span class="info">--- blacklight_importer/src/marcoverride/MarcDirStreamReader.java        2008-05-14 21:42:06 UTC (rev 559)
+++ blacklight_importer/src/marcoverride/MarcDirStreamReader.java        2008-05-14 21:42:42 UTC (rev 560)
</span><span class="lines">@@ -15,20 +15,32 @@
</span><span class="cx">     File list[];
</span><span class="cx">     MarcReader curFileReader;
</span><span class="cx">     int curFileNum;
</span><ins>+    boolean permissive;
</ins><span class="cx">     
</span><span class="cx">     public MarcDirStreamReader(String dirName)
</span><span class="cx">     {
</span><span class="cx">         File dir = new File(dirName);
</span><del>-        init(dir);
</del><ins>+        init(dir, false);
</ins><span class="cx">     }
</span><span class="cx">     
</span><span class="cx">     public MarcDirStreamReader(File dir)
</span><span class="cx">     {
</span><del>-        init(dir);
</del><ins>+        init(dir, false);
</ins><span class="cx">     }
</span><span class="cx"> 
</span><del>-    private void init(File dir)
</del><ins>+    public MarcDirStreamReader(String dirName, boolean permissive)
</ins><span class="cx">     {
</span><ins>+        File dir = new File(dirName);
+        init(dir, permissive);
+    }
+    
+    public MarcDirStreamReader(File dir, boolean permissive)
+    {
+        init(dir, permissive);
+    }
+
+    private void init(File dir, boolean permissive)
+    {
</ins><span class="cx">         FilenameFilter filter = new FilenameFilter()
</span><span class="cx">         {
</span><span class="cx">             public boolean accept(File dir, String name)
</span><span class="lines">@@ -36,6 +48,7 @@
</span><span class="cx">                 return(name.endsWith(&quot;mrc&quot;));
</span><span class="cx">             }
</span><span class="cx">         };
</span><ins>+        this.permissive = permissive; 
</ins><span class="cx">         list = dir.listFiles(filter);
</span><span class="cx">         java.util.Arrays.sort(list);
</span><span class="cx">         curFileNum = 0;
</span><span class="lines">@@ -58,7 +71,7 @@
</span><span class="cx">             try
</span><span class="cx">             {
</span><span class="cx">                 System.err.println(&quot;Switching to input file: &quot;+ list[curFileNum]);
</span><del>-                curFileReader = new MarcStreamReader(new FileInputStream(list[curFileNum++]));
</del><ins>+                curFileReader = new MarcPermissiveStreamReader(new FileInputStream(list[curFileNum++]), permissive);
</ins><span class="cx">             }
</span><span class="cx">             catch (FileNotFoundException e)
</span><span class="cx">             {
</span></span></pre></div>
<a id="blacklight_importersrcmarcoverrideMarcPermissiveStreamReaderjava"></a>
<div class="addfile"><h4>Added: blacklight_importer/src/marcoverride/MarcPermissiveStreamReader.java (0 => 560)</h4>
<pre class="diff"><span>
<span class="info">--- blacklight_importer/src/marcoverride/MarcPermissiveStreamReader.java                                (rev 0)
+++ blacklight_importer/src/marcoverride/MarcPermissiveStreamReader.java        2008-05-14 21:42:42 UTC (rev 560)
</span><span class="lines">@@ -0,0 +1,895 @@
</span><ins>+// $Id: MarcStreamReader.java,v 1.10 2006/12/04 18:45:44 bpeters Exp $
+/**
+ * Copyright (C) 2004 Bas Peters
+ *
+ * This file is part of MARC4J
+ *
+ * MARC4J is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public 
+ * License as published by the Free Software Foundation; either 
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * MARC4J is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public 
+ * License along with MARC4J; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package marcoverride;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+
+import org.marc4j.Constants;
+import org.marc4j.MarcException;
+import org.marc4j.MarcReader;
+import org.marc4j.converter.CharConverter;
+import org.marc4j.converter.impl.AnselToUnicode;
+import org.marc4j.converter.impl.Iso5426ToUnicode;
+import org.marc4j.marc.ControlField;
+import org.marc4j.marc.DataField;
+import org.marc4j.marc.Leader;
+import org.marc4j.marc.MarcFactory;
+import org.marc4j.marc.Record;
+import org.marc4j.marc.Subfield;
+import org.marc4j.marc.VariableField;
+import org.marc4j.marc.impl.Verifier;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
+/**
+ * An iterator over a collection of MARC records in ISO 2709 format.
+ * &lt;p&gt;
+ * Example usage:
+ * 
+ * &lt;pre&gt;
+ * InputStream input = new FileInputStream(&amp;quot;file.mrc&amp;quot;);
+ * MarcReader reader = new MarcStreamReader(input);
+ * while (reader.hasNext()) {
+ *     Record record = reader.next();
+ *     // Process record
+ * }
+ * &lt;/pre&gt;
+ * 
+ * &lt;p&gt;
+ * Check the {@link org.marc4j.marc}&amp;nbsp;package for examples about the use of
+ * the {@link org.marc4j.marc.Record}&amp;nbsp;object model.
+ * &lt;/p&gt;
+ * 
+ * &lt;p&gt;
+ * When no encoding is given as an constructor argument the parser tries to
+ * resolve the encoding by looking at the character coding scheme (leader
+ * position 9) in MARC21 records. For UNIMARC records this position is not
+ * defined.
+ * &lt;/p&gt;
+ * 
+ * @author Bas Peters
+ * @version $Revision: 1.10 $
+ * 
+ */
+public class MarcPermissiveStreamReader implements MarcReader {
+
+    private DataInputStream input = null;
+
+    private Record record;
+
+    private MarcFactory factory;
+
+    private String encoding = &quot;ISO8859_1&quot;;
+
+    private boolean override = false;
+
+    private boolean hasNext = true;
+   
+    private boolean permissive = false;
+    
+    private CharConverter converterAnsel = null;
+
+    private CharConverter converterUnimarc = null;
+    
+    private String conversionCheck1 = null;
+    
+    private String conversionCheck2 = null;
+
+    private String conversionCheck3 = null;
+
+    private static HashMap&lt;String, String&gt; langMap = null;
+
+    /**
+     * Constructs an instance with the specified input stream.
+     */
+    public MarcPermissiveStreamReader(InputStream input, boolean permissive) {
+        this(input, null, permissive);
+    }
+
+    /**
+     * Constructs an instance with the specified input stream.
+     */
+    public MarcPermissiveStreamReader(InputStream input, String encoding) {
+        this(input, encoding, true);
+    }
+
+    /**
+     * Constructs an instance with the specified input stream and character
+     * encoding.
+     */
+    public MarcPermissiveStreamReader(InputStream input, String encoding, boolean permissive) {
+        this.permissive = permissive;
+        this.input = new DataInputStream(new BufferedInputStream(input));
+        factory = MarcFactory.newInstance();
+        if (encoding != null) {
+            this.encoding = encoding;
+            override = true;
+        }
+    }
+
+    /**
+     * Returns true if the iteration has more records, false otherwise.
+     */
+    public boolean hasNext() {
+        try {
+            if (input.available() == 0)
+                return false;
+        } catch (IOException e) {
+            throw new MarcException(e.getMessage(), e);
+        }
+        return true;
+    }
+
+    /**
+     * Returns the next record in the iteration.
+     * 
+     * @return Record - the record object
+     */
+    public Record next() 
+    {
+        record = factory.newRecord();
+
+        try {
+
+            byte[] byteArray = new byte[24];
+            input.readFully(byteArray);
+
+            int recordLength = parseRecordLength(byteArray);
+            byte[] recordBuf = new byte[recordLength - 24];
+            if (permissive) 
+            {
+                input.mark(recordLength * 2);
+                input.readFully(recordBuf);
+                if (recordBuf[recordBuf.length-1] != Constants.RT)
+                {
+                    recordBuf = rereadPermissively(input, recordBuf, recordLength);
+                    recordLength = recordBuf.length + 24;
+                }
+            }
+            else
+            {
+                input.readFully(recordBuf);
+            }
+            String tmp = new String(recordBuf);
+            parseRecord(record, byteArray, recordBuf, recordLength);
+            return(record);
+        }
+        catch (EOFException e) {
+            throw new MarcException(&quot;Premature end of file encountered&quot;, e);
+        } 
+        catch (IOException e) {
+            throw new MarcException(&quot;an error occured reading input&quot;, e);
+        }   
+    }
+    
+    private byte[] rereadPermissively(DataInputStream input, byte[] recordBuf, int recordLength) throws IOException
+    {
+        int loc = arrayContainsAt(recordBuf, Constants.RT);
+        if (loc != -1)  // stated record length is too long
+        {
+            recordLength = loc + 24;
+            input.reset();
+            recordBuf = new byte[recordLength - 24];
+            input.readFully(recordBuf);
+        }
+        else  // stated record length is too short read ahead
+        {
+            loc = recordLength - 24;
+            int c = 0;
+            do 
+            {
+                c = input.read();
+                loc++;
+            } while (loc &lt; recordLength + 100 &amp;&amp; c != Constants.RT &amp;&amp; c != -1);

+            if (c == Constants.RT)
+            {
+                recordLength = loc + 24;
+                input.reset();
+                recordBuf = new byte[recordLength - 24];
+                input.readFully(recordBuf);
+            }
+            else if (c == -1)
+            {
+                recordLength = loc + 24;
+                input.reset();
+                recordBuf = new byte[recordLength - 24 + 1];
+                input.readFully(recordBuf);
+                recordBuf[recordBuf.length-1] = Constants.RT;  
+            }
+        }
+        return(recordBuf);
+    }
+        
+    private void parseRecord(Record record, byte[] byteArray, byte[] recordBuf, int recordLength)
+    {
+        Leader ldr;
+        ldr = factory.newLeader();
+        ldr.setRecordLength(recordLength);
+        int directoryLength=0;
+        conversionCheck1 = &quot;&quot;;
+        conversionCheck2 = &quot;&quot;;
+        conversionCheck3 = &quot;&quot;;
+        
+        try {                
+            parseLeader(ldr, byteArray);
+            directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
+        } 
+        catch (IOException e) {
+            throw new MarcException(&quot;error parsing leader with data: &quot;
+                    + new String(byteArray), e);
+        } 
+        catch (MarcException e) {
+            if (permissive)
+            {
+                if (recordBuf[recordBuf.length-1] == Constants.RT &amp;&amp; recordBuf[recordBuf.length-2] == Constants.FT)
+                {
+                    System.err.println(&quot;Warning: Corrupt record encountered, attempting to read permissively&quot;);
+                    // make an attempt to recover record.
+                    int offset = 0;
+                    while (offset &lt; recordBuf.length)
+                    {
+                        if (recordBuf[offset] == Constants.FT)
+                        {
+                            break;
+                        }
+                        offset++;
+                    }
+                    if (offset % 12 == 1)
+                    {
+                        // move one byte from body to leader, make new leader, and try again
+                        byte oldBody[] = recordBuf;
+                        recordBuf = new byte[oldBody.length-1];
+                        System.arraycopy(oldBody, 1, recordBuf, 0, oldBody.length-1);
+                        directoryLength = offset-1;
+                        ldr.setIndicatorCount(2);
+                        ldr.setSubfieldCodeLength(2);
+                        ldr.setImplDefined1((&quot;&quot;+(char)byteArray[7]+&quot; &quot;).toCharArray());
+                        ldr.setImplDefined2((&quot;&quot;+(char)byteArray[18]+(char)byteArray[19]+(char)byteArray[20]).toCharArray());
+                        ldr.setEntryMap(&quot;4500&quot;.toCharArray());
+                        if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a'
+                        {
+                            ldr.setCharCodingScheme((char)byteArray[10]);
+                        }
+                    }
+                    else if (offset % 12 == 11) 
+                    {
+                        byte oldBody[] = recordBuf;
+                        recordBuf = new byte[oldBody.length+1];
+                        System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
+                        recordBuf[0] = (byte)'0';
+                        directoryLength = offset+1;
+                        ldr.setIndicatorCount(2);
+                        ldr.setSubfieldCodeLength(2);
+                        ldr.setImplDefined1((&quot;&quot;+(char)byteArray[7]+&quot; &quot;).toCharArray());
+                        ldr.setImplDefined2((&quot;&quot;+(char)byteArray[16]+(char)byteArray[17]+(char)byteArray[18]).toCharArray());
+                        ldr.setEntryMap(&quot;4500&quot;.toCharArray());
+                        if (byteArray[8] == (byte)' ' || byteArray[8] == (byte)'a') // if its ' ' or 'a'
+                        {
+                            ldr.setCharCodingScheme((char)byteArray[10]);
+                        }
+                        if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a'
+                        {
+                            ldr.setCharCodingScheme((char)byteArray[10]);
+                        }
+                    }
+                    else
+                    {
+                        throw new MarcException(&quot;error parsing leader with data: &quot;
+                                + new String(byteArray), e);
+                    }
+                }
+            }
+            else
+            {
+                throw new MarcException(&quot;error parsing leader with data: &quot;
+                        + new String(byteArray), e);
+            }
+        }
+
+        // if MARC 21 then check encoding
+        switch (ldr.getCharCodingScheme()) {
+        case ' ':
+            if (!override)
+                encoding = &quot;ISO-8859-1&quot;;
+            break;
+        case 'a':
+            if (!override)
+                encoding = &quot;UTF8&quot;;
+        }
+        String utfCheck;
+        if (permissive &amp;&amp; encoding == &quot;UTF8&quot;)
+        {
+            try
+            {
+                utfCheck = new String(recordBuf, &quot;UTF-8&quot;);
+                byte byteCheck[] = utfCheck.getBytes(&quot;UTF-8&quot;);
+                if (recordBuf.length != byteCheck.length)
+                {
+                }
+                for (int i = 0; i &lt; recordBuf.length; i++)
+                {
+                    if (recordBuf[i] == 0x1B || byteCheck[i] != recordBuf[i])
+                    {
+                        encoding = &quot;MARC8-Maybe&quot;;
+                        break;
+                    }
+                    
+                }
+                if (utfCheck.contains(&quot;a$1!&quot;))
+                {
+                    encoding = &quot;MARC8-Broken&quot;;
+                }
+            }
+            catch (UnsupportedEncodingException e)
+            {
+                // TODO Auto-generated catch block
+                e.printStackTrace();
+            }
+        }
+        record.setLeader(ldr);
+        
+        if ((directoryLength % 12) != 0)
+        {
+            if (permissive &amp;&amp; directoryLength % 12 == 11 &amp;&amp; recordBuf[1] != (byte)'0') 
+            {
+                System.err.println(&quot;Warning: Corrupt record encountered, attempting to read permissively&quot;);
+                byte oldBody[] = recordBuf;
+                recordBuf = new byte[oldBody.length+1];
+                System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
+                recordBuf[0] = (byte)'0';
+                directoryLength = directoryLength+1;
+            }
+            else
+            {
+                throw new MarcException(&quot;invalid directory&quot;);
+            }
+        }
+        DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf));
+        int size = directoryLength / 12;
+
+        String[] tags = new String[size];
+        int[] lengths = new int[size];
+
+        byte[] tag = new byte[3];
+        byte[] length = new byte[4];
+        byte[] start = new byte[5];
+
+        String tmp;
+
+        try {
+            for (int i = 0; i &lt; size; i++) 
+            {
+                inputrec.readFully(tag);                
+                tmp = new String(tag);
+                tags[i] = tmp;
+    
+                inputrec.readFully(length);
+                tmp = new String(length);
+                lengths[i] = Integer.parseInt(tmp);
+    
+                inputrec.readFully(start);
+            }
+    
+            if (inputrec.read() != Constants.FT)
+            {
+                throw new MarcException(&quot;expected field terminator at end of directory&quot;);
+            }
+            
+            int numBadLengths = 0;
+            
+            for (int i = 0; i &lt; size; i++) 
+            {
+                int fieldLength = getFieldLength(inputrec);
+                if (fieldLength+1 != lengths[i] &amp;&amp; permissive)
+                {
+                    if (numBadLengths &lt; 3 &amp;&amp; Math.abs(fieldLength - lengths[i]) &lt; 10)
+                    {
+                        numBadLengths++;
+                        lengths[i] = fieldLength+1;
+                    }
+                }
+                if (Verifier.isControlField(tags[i])) 
+                {
+                    byteArray = new byte[lengths[i] - 1];
+                    inputrec.readFully(byteArray);
+    
+                    if (inputrec.read() != Constants.FT)
+                    {
+                        throw new MarcException(&quot;expected field terminator at end of field&quot;);
+                    }
+    
+                    ControlField field = factory.newControlField();
+                    field.setTag(tags[i]);
+                    field.setData(getDataAsString(byteArray));
+                    record.addVariableField(field);
+    
+                } 
+                else 
+                {
+                    byteArray = new byte[lengths[i]];
+                    inputrec.readFully(byteArray);
+    
+                    try {
+                        record.addVariableField(parseDataField(tags[i],
+                                byteArray));
+                    } catch (IOException e) {
+                        throw new MarcException(
+                                &quot;error parsing data field for tag: &quot; + tags[i]
+                                        + &quot; with data: &quot;
+                                        + new String(byteArray), e);
+                    }
+                }
+            }
+            
+            // We've determined that although the record says it is UTF-8, it is not. 
+            // Here we make an attempt to determine the actual encoding of the data in the record.
+            if (permissive &amp;&amp; conversionCheck1.length() &gt; 1 &amp;&amp; 
+                    conversionCheck2.length() &gt; 1 &amp;&amp; conversionCheck3.length() &gt; 1)
+            {
+                int partToUse = 0;
+                if (conversionCheck2.length() &lt; conversionCheck1.length()
+                        &amp;&amp; conversionCheck2.length() &lt; conversionCheck3.length())
+                {
+                    partToUse = 1;
+                }
+                else if (conversionCheck1.length() &gt; conversionCheck3.length())
+                {
+                    partToUse = 0;
+                }
+                else if (conversionCheck3.length() &gt; conversionCheck1.length())
+                {
+                    partToUse = 2;
+                }
+                else if (conversionCheck2.equals(conversionCheck3) &amp;&amp; !conversionCheck1.trim().contains(&quot; &quot;))
+                {
+                    partToUse = 2;
+                }
+                else if (numLetters(conversionCheck1) == 0)
+                {
+                    partToUse = 0;
+                }
+                else
+                {
+                    CharsetDetector detect = new CharsetDetector();
+                    byte m8Bytes[] = null;
+                    byte isoBytes[] = null;
+                    byte uniBytes[] = null;
+                    try
+                    {
+                        m8Bytes = conversionCheck1.getBytes(&quot;ISO-8859-1&quot;);
+                        uniBytes = conversionCheck2.getBytes(&quot;ISO-8859-1&quot;);
+                        isoBytes = conversionCheck3.getBytes(&quot;ISO-8859-1&quot;);
+                    }
+                    catch (UnsupportedEncodingException e)
+                    {
+                        // TODO Auto-generated catch block
+                        e.printStackTrace();
+                    }
+                    detect.setText(m8Bytes);
+                    CharsetMatch match1 = langDetect(detect, record);
+                    System.err.println(match1 != null ? match1.getName() + &quot; &quot; + match1.getConfidence() + &quot; &quot; + match1.getLanguage() : &quot;No Match&quot;);
+                    
+                    detect.setText(uniBytes);
+                    CharsetMatch match2 = langDetect(detect, record);
+                    System.err.println(match2 != null ? match2.getName() + &quot; &quot; + match2.getConfidence() + &quot; &quot; + match2.getLanguage() : &quot;No Match&quot;);
+                    
+                    detect.setText(isoBytes);
+                    CharsetMatch match3 = langDetect(detect, record);
+                    System.err.println(match3 != null ? match3.getName() + &quot; &quot; + match3.getConfidence() + &quot; &quot; + match3.getLanguage() : &quot;No Match&quot;);
+                    
+                    if (match1 == null &amp;&amp; match2 == null &amp;&amp; match3 == null)
+                    {
+                        partToUse = 0;
+                    }
+                    else if (match1 != null &amp;&amp; (match2 == null || match3 == null))
+                    {
+                        partToUse = 0;
+                    }
+                    else if (match1.getConfidence() &gt;= match2.getConfidence() &amp;&amp; 
+                            match1.getConfidence() &gt;= match3.getConfidence() )
+                    {
+                        partToUse = 0;
+                    }
+//                    else if (match2.getConfidence() &gt; match1.getConfidence() &amp;&amp; 
+//                            match2.getConfidence() &gt; match3.getConfidence() )
+//                    {
+//                        partToUse = 1;
+//                    }
+                    else if (match3.getConfidence() &gt; match1.getConfidence() &amp;&amp; 
+                            match3.getConfidence() &gt; match2.getConfidence() )
+                    {
+                        partToUse = 2;
+                    }
+                }
+                List&lt;VariableField&gt; fields = record.getVariableFields();
+                Iterator&lt;VariableField&gt; iter = fields.iterator();
+                while (iter.hasNext())
+                {
+                    VariableField field = iter.next();
+                    if (field instanceof DataField)
+                    {
+                        DataField df = (DataField)field;
+                        List&lt;Subfield&gt; subf = df.getSubfields();
+                        Iterator&lt;Subfield&gt; sfiter = subf.iterator();
+                        while (sfiter.hasNext())
+                        {
+                            Subfield sf = sfiter.next();
+                            if (sf.getData().contains(&quot;%%@%%&quot;))
+                            {
+                                String parts[] = sf.getData().split(&quot;%%@%%&quot;, 3);
+                                sf.setData(parts[partToUse]);
+                            }
+                        }
+                    }
+                }                      
+            }
+
+            if (inputrec.read() != Constants.RT)
+            {
+                throw new MarcException(&quot;expected record terminator&quot;);
+            } 
+        }
+        catch (IOException e)
+        {
+            throw new MarcException(&quot;an error occured reading input&quot;, e);            
+        }
+    }
+
+    private int arrayContainsAt(byte[] byteArray, int ft)
+    {
+        for (int i = 0; i &lt; byteArray.length; i++)
+        {
+            if (byteArray[i] == (byte)ft)  return(i);
+        }
+        return(-1);
+    }
+
+    private CharsetMatch langDetect(CharsetDetector detect, Record record)
+    {
+        String lang = extractLang(record);
+        if (lang != null &amp;&amp; !lang.equals(&quot;eng&quot;))
+        {
+            CharsetMatch matches[] = detect.detectAll();
+            for (int i = 0; i &lt; matches.length; i++)
+            {
+                if (langMap(matches[i].getLanguage()).equals(lang))
+                {
+                    return(matches[i]);
+                }
+            }
+            return(matches.length &gt; 0 ? matches[0] : null);
+        }
+        else
+        {
+            return(detect.detect());
+        }
+    }
+
+    private Object langMap(String language)
+    {
+        if (langMap == null) 
+        {
+            langMap = new HashMap&lt;String, String&gt;();
+            langMap.put(&quot;de&quot;, &quot;ger&quot;);
+            langMap.put(&quot;nl&quot;, &quot;dut&quot;);
+            langMap.put(&quot;fr&quot;, &quot;fre&quot;);
+            langMap.put(&quot;fi&quot;, &quot;fin&quot;);
+            langMap.put(&quot;sv&quot;, &quot;swe&quot;);
+            langMap.put(&quot;it&quot;, &quot;ita&quot;);
+            langMap.put(&quot;es&quot;, &quot;spa&quot;);
+            langMap.put(&quot;en&quot;, &quot;eng&quot;);
+            langMap.put(&quot;da&quot;, &quot;dan&quot;);
+            langMap.put(&quot;no&quot;, &quot;nor&quot;);
+            langMap.put(&quot;tr&quot;, &quot;tur&quot;);
+            langMap.put(&quot;hu&quot;, &quot;hun&quot;);
+            langMap.put(&quot;ro&quot;, &quot;rum&quot;);
+            langMap.put(&quot;cs&quot;, &quot;cze&quot;);
+            langMap.put(&quot;pt&quot;, &quot;por&quot;);
+            langMap.put(&quot;pl&quot;, &quot;pol&quot;);
+            langMap.put(&quot;ru&quot;, &quot;rus&quot;);
+            langMap.put(&quot;ar&quot;, &quot;ara&quot;);
+            langMap.put(&quot;el&quot;, &quot;gre&quot;);
+            langMap.put(&quot;he&quot;, &quot;heb&quot;);
+            langMap.put(&quot;pt&quot;, &quot;por&quot;);
+            langMap.put(&quot;pl&quot;, &quot;pol&quot;);
+        }
+        if (langMap.containsKey(language))
+        {
+            return(langMap.get(language));
+        }
+        return null;
+    }
+
+    private String extractLang(Record record)
+    {
+        VariableField f = record.getVariableField(&quot;008&quot;);
+        ControlField cf = (ControlField)f;
+        if (cf != null)
+        {
+            String data = cf.getData();
+            if (data.length() &gt;= 38)
+            {
+                String lang = data.substring(35, 38);
+                return(lang);
+            }
+        }
+        return null;
+    }
+
+    private int numLetters(String conversionCheck)
+    {
+        int count = 0;
+        for (int i = 0; i &lt; conversionCheck.length(); i++)
+        {
+            if (Character.isLetter(conversionCheck.charAt(i)))   count++;
+        }
+        return(count);
+    }
+
+    /**
+     * Builds a DataField from the raw bytes of one variable field.
+     *
+     * The first two bytes are taken as the two indicators. After that, every
+     * subfield delimiter (Constants.US) is followed by a one-byte subfield code
+     * and the subfield data, which runs up to the next delimiter, the field
+     * terminator, or the end of the array. Any other byte met between
+     * subfields is ignored.
+     *
+     * @param tag   the tag to assign to the new field
+     * @param field the field bytes, starting with the indicators
+     * @return the populated data field
+     * @throws IOException if the data ends right after a subfield delimiter
+     */
+    private DataField parseDataField(String tag, byte[] field)
+            throws IOException {
+        ByteArrayInputStream bais = new ByteArrayInputStream(field);
+        char firstIndicator = (char) bais.read();
+        char secondIndicator = (char) bais.read();
+
+        DataField dataField = factory.newDataField();
+        dataField.setTag(tag);
+        dataField.setIndicator1(firstIndicator);
+        dataField.setIndicator2(secondIndicator);
+
+        int next;
+        while ((next = bais.read()) &gt;= 0) {
+            if (next != Constants.US) {
+                // Field terminators and stray bytes carry no subfield data.
+                continue;
+            }
+            int code = bais.read();
+            if (code &lt; 0) {
+                throw new IOException(&quot;unexpected end of data field&quot;);
+            }
+            if (code == Constants.FT) {
+                // A delimiter directly followed by the terminator: nothing to add.
+                continue;
+            }
+            byte[] payload = new byte[getSubfieldLength(bais)];
+            bais.read(payload);
+            Subfield parsed = factory.newSubfield();
+            parsed.setCode((char) code);
+            parsed.setData(getDataAsString(payload));
+            dataField.addSubfield(parsed);
+        }
+        return dataField;
+    }
+    
+    /**
+     * Counts the bytes from the current stream position up to, but not
+     * including, the next field terminator (Constants.FT).
+     *
+     * The stream is marked first (read limit 9999) and reset before returning
+     * or throwing, so the caller can read the counted bytes afterwards.
+     *
+     * @param bais the stream positioned at the start of a field
+     * @return the number of bytes before the terminator; in permissive mode,
+     *         the number of bytes before end of stream if no terminator exists
+     * @throws IOException if the stream ends before a terminator and the
+     *                     reader is not permissive, or if the stream itself fails
+     */
+    private int getFieldLength(DataInputStream bais) throws IOException
+    {
+        bais.mark(9999);
+        int counted = 0;
+        for (;;) {
+            int current = bais.read();
+            if (current == Constants.FT) {
+                bais.reset();
+                return counted;
+            }
+            if (current == -1) {
+                bais.reset();
+                if (!permissive) {
+                    throw new IOException(&quot;Field not terminated&quot;);
+                }
+                return counted;
+            }
+            // Every other byte, subfield delimiters included, belongs to the field.
+            counted++;
+        }
+    }
+
+    /**
+     * Counts the bytes from the current position up to, but not including,
+     * the next subfield delimiter (Constants.US) or field terminator
+     * (Constants.FT).
+     *
+     * The stream is marked first and reset before returning or throwing, so
+     * the position seen by the caller is unchanged.
+     *
+     * @param bais the stream positioned at the first byte of subfield data
+     * @return the number of data bytes; in permissive mode, the number of
+     *         bytes left when the data ends without a delimiter or terminator
+     * @throws IOException if the data ends unterminated and the reader is
+     *                     not permissive
+     */
+    private int getSubfieldLength(ByteArrayInputStream bais) throws IOException {
+        bais.mark(9999);
+        int counted = 0;
+        for (;;) {
+            int current = bais.read();
+            if (current == Constants.US || current == Constants.FT) {
+                bais.reset();
+                return counted;
+            }
+            if (current == -1) {
+                bais.reset();
+                if (!permissive) {
+                    throw new IOException(&quot;subfield not terminated&quot;);
+                }
+                return counted;
+            }
+            counted++;
+        }
+    }
+
+    /**
+     * Parses the record length from the first five characters of the leader.
+     *
+     * The bytes are decoded as ISO-8859-1 rather than with the platform
+     * default charset: the leader is a fixed-position byte structure, and a
+     * one-byte-per-character decoding keeps the positions stable whatever
+     * the JVM default charset is.
+     *
+     * @param leaderData the raw leader bytes
+     * @return the numeric value of the first five leader characters
+     * @throws IOException   if reading from the in-memory stream fails
+     * @throws MarcException if those characters are not a valid integer
+     *                       (including when fewer than five are available)
+     */
+    private int parseRecordLength(byte[] leaderData) throws IOException {
+        InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
+                leaderData), &quot;ISO-8859-1&quot;);
+        char[] tmp = new char[5];
+        try {
+            isr.read(tmp);
+        } finally {
+            isr.close();
+        }
+        try {
+            return Integer.parseInt(new String(tmp));
+        } catch (NumberFormatException e) {
+            throw new MarcException(&quot;unable to parse record length&quot;, e);
+        }
+    }
+    
+    /**
+     * Fills a Leader object from the raw leader bytes.
+     *
+     * Characters are consumed strictly by position: five are skipped, then
+     * record status, type of record, two implementation-defined characters,
+     * the character coding scheme, indicator count, subfield code length,
+     * five characters of base address, three implementation-defined
+     * characters and four entry-map characters. The bytes are decoded as
+     * ISO-8859-1 so that each byte maps to exactly one character regardless
+     * of the platform default charset; a multi-byte default could otherwise
+     * shift every later position.
+     *
+     * @param ldr        the leader to populate
+     * @param leaderData the raw leader bytes
+     * @throws IOException   if reading from the in-memory stream fails
+     * @throws MarcException if the indicator count, subfield code length or
+     *                       base address is not numeric
+     */
+    private void parseLeader(Leader ldr, byte[] leaderData) throws IOException {
+        InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
+                leaderData), &quot;ISO-8859-1&quot;);
+        char indicatorCount;
+        char subfieldCodeLength;
+        char baseAddr[] = new char[5];
+        try {
+            // Skip the five record-length characters; per the original note,
+            // the length has already been computed by the time we get here.
+            char[] tmp = new char[5];
+            isr.read(tmp);
+            ldr.setRecordStatus((char) isr.read());
+            ldr.setTypeOfRecord((char) isr.read());
+            tmp = new char[2];
+            isr.read(tmp);
+            ldr.setImplDefined1(tmp);
+            ldr.setCharCodingScheme((char) isr.read());
+            indicatorCount = (char) isr.read();
+            subfieldCodeLength = (char) isr.read();
+            isr.read(baseAddr);
+            tmp = new char[3];
+            isr.read(tmp);
+            ldr.setImplDefined2(tmp);
+            tmp = new char[4];
+            isr.read(tmp);
+            ldr.setEntryMap(tmp);
+        } finally {
+            // Close the reader even when a read or a setter throws.
+            isr.close();
+        }
+        try {
+            ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount)));
+        } catch (NumberFormatException e) {
+            throw new MarcException(&quot;unable to parse indicator count&quot;, e);
+        }
+        try {
+            ldr.setSubfieldCodeLength(Integer.parseInt(String
+                    .valueOf(subfieldCodeLength)));
+        } catch (NumberFormatException e) {
+            throw new MarcException(&quot;unable to parse subfield code length&quot;, e);
+        }
+        try {
+            ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr)));
+        } catch (NumberFormatException e) {
+            throw new MarcException(&quot;unable to parse base address of data&quot;, e);
+        }
+
+    }
+
+    /**
+     * Converts the raw bytes of a data element to a String according to the
+     * reader's current encoding setting, then replaces the five predefined
+     * XML entity references (lt, gt, amp, apos, quot) with their characters.
+     *
+     * Recognised encoding values:
+     *   UTF-8 / UTF8            - decoded as UTF-8
+     *   MARC-8 / MARC8          - converted with AnselToUnicode
+     *   MARC8-Maybe             - converted three ways (AnselToUnicode,
+     *                             Iso5426ToUnicode, ISO-8859-1); if the results
+     *                             differ they are appended to the
+     *                             conversionCheck accumulators and returned
+     *                             joined by the %%@%% separator
+     *   MARC8-Broken            - decoded as ISO-8859-1, entity references
+     *                             replaced, an ESC character inserted before
+     *                             each $1 and (B sequence, then converted with
+     *                             AnselToUnicode
+     *   ISO-8859-1 / ISO8859_1  - decoded as ISO-8859-1
+     *
+     * @param bytes the raw bytes of the data element
+     * @return the decoded text
+     * @throws MarcException if the encoding setting is not one of the values
+     *                       above, or the JVM lacks a required charset
+     */
+    private String getDataAsString(byte[] bytes)
+    {
+        String dataElement = null;
+        if (encoding.equals(&quot;UTF-8&quot;) || encoding.equals(&quot;UTF8&quot;))
+        {
+            try {
+                dataElement = new String(bytes, &quot;UTF8&quot;);
+            }
+            catch (UnsupportedEncodingException e) {
+                throw new MarcException(&quot;unsupported encoding&quot;, e);
+            }
+        }
+        else if (encoding.equals(&quot;MARC-8&quot;) || encoding.equals(&quot;MARC8&quot;))
+        {
+            if (converterAnsel == null) converterAnsel = new AnselToUnicode();
+            dataElement = converterAnsel.convert(bytes);
+        }
+        else if (encoding.equals(&quot;MARC8-Maybe&quot;))
+        {
+            if (converterAnsel == null) converterAnsel = new AnselToUnicode();
+            if (converterUnimarc == null) converterUnimarc = new Iso5426ToUnicode();
+            String dataElement1 = converterAnsel.convert(bytes);
+            String dataElement2 = converterUnimarc.convert(bytes);
+            String dataElement3 = null;
+            try
+            {
+                dataElement3 = new String(bytes, &quot;ISO-8859-1&quot;);
+            }
+            catch (UnsupportedEncodingException e)
+            {
+                // Report the failure the same way the other branches do
+                // instead of continuing with a null candidate.
+                throw new MarcException(&quot;unsupported encoding&quot;, e);
+            }
+            if (dataElement1.equals(dataElement2) &amp;&amp; dataElement1.equals(dataElement3))
+            {
+                dataElement = dataElement1;
+            }
+            else
+            {
+                // The candidates disagree: record each one for later
+                // inspection and hand back all three, separator-joined.
+                conversionCheck1 = conversionCheck1 + &quot; &quot; + dataElement1;
+                conversionCheck2 = conversionCheck2 + &quot; &quot; + dataElement2;
+                conversionCheck3 = conversionCheck3 + &quot; &quot; + dataElement3;
+                dataElement = dataElement1 + &quot;%%@%%&quot; + dataElement2 + &quot;%%@%%&quot; + dataElement3;
+            }
+        }
+        else if (encoding.equals(&quot;MARC8-Broken&quot;))
+        {
+            // This branch may be the first to need the converter, so create
+            // it lazily here as the other MARC-8 branches do.
+            if (converterAnsel == null) converterAnsel = new AnselToUnicode();
+            try
+            {
+                dataElement = new String(bytes, &quot;ISO-8859-1&quot;);
+            }
+            catch (UnsupportedEncodingException e)
+            {
+                throw new MarcException(&quot;unsupported encoding&quot;, e);
+            }
+            dataElement = dataElement.replaceAll(&quot;&amp;lt;&quot;, &quot;&lt;&quot;);
+            dataElement = dataElement.replaceAll(&quot;&amp;gt;&quot;, &quot;&gt;&quot;);
+            dataElement = dataElement.replaceAll(&quot;&amp;amp;&quot;, &quot;&amp;&quot;);
+            dataElement = dataElement.replaceAll(&quot;&amp;apos;&quot;, &quot;'&quot;);
+            dataElement = dataElement.replaceAll(&quot;&amp;quot;&quot;, &quot;\&quot;&quot;);
+            // Put an ESC (0x1b) in front of every $1 and (B sequence before
+            // running the converter.
+            String rep1 = &quot;&quot;+(char)0x1b+&quot;\\$1&quot;;
+            String rep2 = &quot;&quot;+(char)0x1b+&quot;\\(B&quot;;
+            dataElement = dataElement.replaceAll(&quot;\\$1&quot;, rep1);
+            dataElement = dataElement.replaceAll(&quot;\\(B&quot;, rep2);
+            dataElement = converterAnsel.convert(dataElement);
+
+        }
+        else if (encoding.equals(&quot;ISO-8859-1&quot;) || encoding.equals(&quot;ISO8859_1&quot;))
+        {
+            try {
+                dataElement = new String(bytes, &quot;ISO-8859-1&quot;);
+            }
+            catch (UnsupportedEncodingException e) {
+                throw new MarcException(&quot;unsupported encoding&quot;, e);
+            }
+        }
+        else
+        {
+            // An unrecognised setting used to fall through with a null
+            // result and fail below with a NullPointerException.
+            throw new MarcException(&quot;unsupported encoding&quot;,
+                    new UnsupportedEncodingException(encoding));
+        }
+        dataElement = dataElement.replaceAll(&quot;&amp;lt;&quot;, &quot;&lt;&quot;);
+        dataElement = dataElement.replaceAll(&quot;&amp;gt;&quot;, &quot;&gt;&quot;);
+        dataElement = dataElement.replaceAll(&quot;&amp;amp;&quot;, &quot;&amp;&quot;);
+        dataElement = dataElement.replaceAll(&quot;&amp;apos;&quot;, &quot;'&quot;);
+        dataElement = dataElement.replaceAll(&quot;&amp;quot;&quot;, &quot;\&quot;&quot;);
+        return dataElement;
+    }
+
+    public boolean isPermissive()
+    {
+        return permissive;
+    }
+
+    public void setPermissive(boolean permissive)
+    {
+        this.permissive = permissive;
+    }
+
+}
</ins><span class="cx">\ No newline at end of file
</span></span></pre>
</div>
</div>

</body>
</html>