Skip to content
This repository was archived by the owner on Jul 14, 2020. It is now read-only.

Commit 05295c6

Browse files
author
Ioan Eugen Stan
committed Feb 15, 2012
Added a work CharBuffer
1 parent 5b31df1 commit 05295c6

File tree

3 files changed

+34
-27
lines changed

3 files changed

+34
-27
lines changed
 

‎nbactions.xml

+15
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,15 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<actions>
3+
<action>
4+
<actionName>run</actionName>
5+
<goals>
6+
<goal>process-classes</goal>
7+
<goal>org.codehaus.mojo:exec-maven-plugin:1.2:exec</goal>
8+
</goals>
9+
<properties>
10+
<exec.args>-classpath %classpath ro.ieugen.mboxiterator.MboxIteratorExample</exec.args>
11+
<exec.executable>java</exec.executable>
12+
<exec.classpathScope>runtime</exec.classpathScope>
13+
</properties>
14+
</action>
15+
</actions>

‎src/main/java/ro/ieugen/mboxiterator/MboxIterator.java

+17 -25
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,13 @@
1-
/****************************************************************
2-
* Licensed to the Apache Software Foundation (ASF) under one *
3-
* or more contributor license agreements. See the NOTICE file *
4-
* distributed with this work for additional information *
5-
* regarding copyright ownership. The ASF licenses this file *
6-
* to you under the Apache License, Version 2.0 (the *
7-
* "License"); you may not use this file except in compliance *
8-
* with the License. You may obtain a copy of the License at *
9-
* *
10-
* http://www.apache.org/licenses/LICENSE-2.0 *
11-
* *
12-
* Unless required by applicable law or agreed to in writing, *
13-
* software distributed under the License is distributed on an *
14-
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
15-
* KIND, either express or implied. See the License for the *
16-
* specific language governing permissions and limitations *
17-
* under the License. *
18-
****************************************************************/
1+
/**
2+
* **************************************************************
3+
* Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file *
4+
* distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you
5+
* under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You
6+
* may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or
7+
* agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR
8+
* CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and
9+
* limitations * under the License. * **************************************************************
10+
*/
1911
package ro.ieugen.mboxiterator;
2012

2113
import java.io.*;
@@ -24,6 +16,7 @@
2416
import java.nio.channels.FileChannel;
2517
import java.nio.charset.Charset;
2618
import java.nio.charset.CharsetDecoder;
19+
import java.nio.charset.CoderResult;
2720
import java.util.Iterator;
2821
import java.util.regex.Matcher;
2922
import java.util.regex.Pattern;
@@ -38,6 +31,7 @@
3831
public class MboxIterator implements Iterable<CharBuffer>, Closeable {
3932

4033
private static final Logger LOG = LoggerFactory.getLogger(MboxIterator.class);
34+
private static final int MAX_MSG_LENGTH = 1024 * 1024 * 10; // 10Mb of Chars!
4135
private final FileInputStream fis;
4236
private final CharBuffer mboxCharBuffer;
4337
private final Matcher fromLineMathcer;
@@ -56,11 +50,9 @@ private MboxIterator(final File mbox,
5650
final MappedByteBuffer byteBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0,
5751
fileChannel.size());
5852
final CharsetDecoder DECODER = Charset.forName(charset).newDecoder();
59-
/*TODO: DECODER.decode() this will try to decode the whole file.
60-
* It could be problematic if the file is large (~2gb).
61-
* Improve this by working with chunks.
62-
*/
63-
mboxCharBuffer = DECODER.decode(byteBuffer);
53+
mboxCharBuffer = CharBuffer.allocate(MAX_MSG_LENGTH);
54+
CoderResult result = DECODER.decode(byteBuffer, mboxCharBuffer, false);
55+
6456
final Pattern MESSAGE_START = Pattern.compile(regexpPattern, regexpFlags);
6557
fromLineMathcer = MESSAGE_START.matcher(mboxCharBuffer);
6658
hasMore = fromLineMathcer.find();
@@ -95,10 +87,10 @@ public boolean hasNext() {
9587

9688
@Override
9789
public CharBuffer next() {
98-
LOG.info("next() called at offset {}", fromLineMathcer.start());
90+
//LOG.info("next() called at offset {}", fromLineMathcer.start());
9991
final CharBuffer message = mboxCharBuffer.slice();
10092
message.position(fromLineMathcer.start());
101-
logBufferDetails(message);
93+
//logBufferDetails(message);
10294
hasMore = fromLineMathcer.find();
10395
if (hasMore) {
10496
LOG.info("We limit the buffer to {} ?? {}",

‎src/main/java/ro/ieugen/mboxiterator/MboxIteratorExample.java

+2 -2
Original file line number | Diff line number | Diff line change
@@ -41,10 +41,10 @@ public class MboxIteratorExample {
4141

4242
// simple example of how to split an mbox into individual files
4343
public static void main(String[] args) throws IOException, FileNotFoundException {
44-
final File mbox = new File("/home/ieugen/contracte/firimituri/gmane.test.yahoo/test-utf");
44+
final File mbox = new File("/home/estan/gmail2.mbox");
4545
long start = System.currentTimeMillis();
4646
int count = 0;
47-
for (CharBuffer buf : new MboxIterator.Builder(mbox).build()) {
47+
for (CharBuffer buf : new MboxIterator.Builder(mbox).charset("ISO-8859-1").build()) {
4848
FileOutputStream fout = new FileOutputStream(new File("target/messages/msg-" + count));
4949
FileChannel fileChannel = fout.getChannel();
5050
ByteBuffer buf2 = ENCODER.encode(buf);

0 commit comments

Comments
 (0)
This repository has been archived.