]> gitweb.pimeys.fr Git - scripts-20-100.git/blob - mails/mailbox_to_maildir.pl
typo
[scripts-20-100.git] / mails / mailbox_to_maildir.pl
1 #!/usr/bin/perl -w
2 #
3
4 #
5 # Found on the web; converts a mailbox (mbox) file into
6 # a directory in Maildir format
7 #
8
9 # $Id: mb2md.pl,v 1.26 2004/03/28 00:09:46 juri Exp $
10 #
11 # mb2md-3.20.pl Converts Mbox mailboxes to Maildir format.
12 #
13 # Public domain.
14 #
15 # currently maintained by:
16 # Juri Haberland <juri@koschikode.com>
17 # initially written by:
18 # Robin Whittle
19 #
20 # This script's web abode is http://batleth.sapienti-sat.org/projects/mb2md/ .
21 # For a changelog see http://batleth.sapienti-sat.org/projects/mb2md/changelog.txt
22 #
23 # The Mbox -> Maildir inner loop is based on qmail's script mbox2maildir, which
24 # was kludged by Ivan Kohler in 1997 from convertandcreate (public domain)
25 # by Russel Nelson. Both these convert a single mailspool file.
26 #
27 # The qmail distribution has a maildir2mbox.c program.
28 #
29 # What it does:
30 # =============
31 #
32 # Reads a directory full of Mbox format mailboxes and creates a set of
33 # Maildir format mailboxes. Some details of this are to suit Courier
34 # IMAP's naming conventions for Maildir mailboxes.
35 #
36 # http://www.inter7.com/courierimap/
37 #
38 # This is intended to automate the conversion of the old
39 # /var/spool/mail/blah file - with one call of this script - and to
40 # convert one or more mailboxes in a specified directory with separate
41 # calls with other command line arguments.
42 #
43 # Run this as the user - in these examples "blah".
44
45 # This version supports conversion of:
46 #
47 # Date The date-time in the "From " line of the message in the
48 # Mbox format is the date when the message was *received*.
49 # This is transformed into the date-time of the file which
50 # contains the message in the Maildir mailbox.
51 #
52 # This relies on the Date::Parse perl module and the utime
53 # perl function.
54 #
55 # The script tries to cope with errant forms of the
56 # Mbox "From " line which it may encounter, but if
57 # there is something really screwy in a From line,
58 # then perhaps the script will fail when "touch"
59 # is given an invalid date. Please report the
60 # exact nature of any such "From " line!
61 #
62 #
63 # Flagged
64 # Replied
65 # Read = Seen
66 # Tagged for Deletion
67 #
68 # In the Mbox message, flags for these are found in the
69 # "Status: N" or "X-Status: N" headers, where "N" is 0
70 # or more of the following characters in the left column.
71 #
72 # They are converted to characters in the right column,
73 # which become the last characters of the file name,
74 # following the ":2," which indicates IMAP message status.
75 #
76 #
77 # F -> F Flagged
78 # A -> R Replied
79 # R -> S Read = Seen
80 # D -> T Tagged for Deletion (Trash)
81 #
82 # This is based on the work of Philip Mak who wrote a
83 # completely separate Mbox -> Maildir converter called
84 # perfect_maildir and posted it to the Mutt-users mailing
85 # list on 25 December 2001:
86 #
87 # http://www.mail-archive.com/mutt-users@mutt.org/msg21872.html
88 #
89 # Michael Best originally integrated those changes into mb2md.
90 #
91 #
92 # In addition, the names of the message files in the Maildir are of a
93 # regular length and are of the form:
94 #
95 # 7654321.000123.mbox:2,xxx
96 #
97 # Where "7654321" is the Unix time in seconds when the script was
98 # run and "000123" is the six zeroes padded message number as
99 # messages are converted from the Mbox file. "xxx" represents zero or
100 # more of the above flags F, R, S or T.
101 #
102 #
103 # ---------------------------------------------------------------------
104 #
105 #
106 # USAGE
107 # =====
108 #
109 # Run this as the user of the mailboxes, not as root.
110 #
111 #
112 # mb2md -h
113 # mb2md [-c] -m [-d destdir]
114 # mb2md [-c] -s sourcefile [-d destdir]
115 # mb2md [-c] -s sourcedir [-l wu-mailboxlist] [-R|-f somefolder] [-d destdir] [-r strip_extension]
116 #
117 # -c use the Content-Length: headers (if present) to find the
118 # beginning of the next message
119 # Use with caution! Results may be unreliable. I recommend to do
120 # a run without "-c" first and only use it if you are certain,
121 # that the mbox in question really needs the "-c" option
122 #
123 # -m If this is used then the source will
124 # be the single mailbox at /var/spool/mail/blah for
125 # user blah and the destination mailbox will be the
126 # "destdir" mailbox itself.
127 #
128 #
129 # -s source Directory or file relative to the user's home directory,
130 # which is where the "somefolders" directories are located.
131 # Or if starting with a "/" it is taken as an
132 # absolute path, e.g. /mnt/oldmail/user
133 #
134 # or
135 #
136 # A single mbox file which will be converted to
137 # the destdir.
138 #
139 # -R If defined, do not skip directories found in a mailbox
140 # directory, but runs recursively into each of them,
141 # creating all wanted folders in Maildir.
142 # Incompatible with '-f'
143 #
144 # -f somefolder Directories, relative to "sourcedir" where the Mbox files
145 # are. All mailboxes in the "sourcedir"
146 # directory will be converted and placed in the
147 # "destdir" directory. (Typically the Inbox directory
148 # which in this instance is also functioning as a
149 # folder for other mailboxes.)
150 #
151 # The "somefolder" directory
152 # name will be encoded into the new mailboxes' names.
153 # See the examples below.
154 #
155 # This does not save an UW IMAP dummy message file
156 # at the start of the Mbox file. Small changes
157 # in the code could adapt it for looking for
158 # other distinctive patterns of dummy messages too.
159 #
160 # Don't let the source directory you give as "somefolders"
161 # contain any "."s in its name, unless you want to
162 # create subfolders from the IMAP user's point of
163 # view. See the example below.
164 #
165 # Incompatible with '-R'
166 #
167 #
168 # -d destdir Directory where the Maildir format directories will be created.
169 # If not given, then the destination will be ~/Maildir .
170 # Typically, this is what the IMAP server sees as the
171 # Inbox and the folder for all user mailboxes.
172 # If this begins with a '/' the path is considered to be
173 # absolute, otherwise it is relative to the user's
174 # home directory.
175 #
176 # -r strip_ext If defined this extension will be stripped from
177 # the original mailbox file name before creating
178 # the corresponding maildir. The extension must be
179 # given without the leading dot ("."). See the example below.
180 #
181 # -l WU-file File containing the list of subscribed folders. If
182 # migrating from WU-IMAP the list of subscribed folders will
183 # be found in the file called .mailboxlist in the user's
184 # home directory. This will convert all subscribed folders
185 # for a single user:
186 # /bin/mb2md -s mail -l .mailboxlist -R -d Maildir
187 # and for all users in a directory as root you can do the
188 # following:
189 # for i in *; do echo $i;su - $i -c "/bin/mb2md -s mail -l .mailboxlist -R -d Maildir";done
190 #
191 #
192 # Example
193 # =======
194 #
195 # We have a bunch of directories of Mbox mailboxes located at
196 # /home/blah/oldmail/
197 #
198 # /home/blah/oldmail/fffff
199 # /home/blah/oldmail/ggggg
200 # /home/blah/oldmail/xxx/aaaa
201 # /home/blah/oldmail/xxx/bbbb
202 # /home/blah/oldmail/xxx/cccc
203 # /home/blah/oldmail/xxx/dddd
204 # /home/blah/oldmail/yyyy/huey
205 # /home/blah/oldmail/yyyy/duey
206 # /home/blah/oldmail/yyyy/louie
207 #
208 # With the UW IMAP server, fffff and ggggg would have appeared in the root
209 # of this mail server, along with the Inbox. aaaa, bbbb etc, would have
210 # appeared in a folder called xxx from that root, and xxx was just a folder
211 # not a mailbox for storing messages.
212 #
213 # We also have the mailspool Inbox at:
214 #
215 # /var/spool/mail/blah
216 #
217 #
218 # To convert these, as user blah, we give the first command:
219 #
220 # mb2md -m
221 #
222 # The main Maildir directory will be created if it does not exist.
223 # (This is true of any argument options, not just "-m".)
224 #
225 # /home/blah/Maildir/
226 #
227 # It has the following subdirectories:
228 #
229 # /home/blah/Maildir/tmp/
230 # /home/blah/Maildir/new/
231 # /home/blah/Maildir/cur/
232 #
233 # Then the /var/spool/mail/blah file is read, split into individual files and
234 # written into /home/blah/Maildir/cur/ .
235 #
236 # Now we give the second command:
237 #
238 # mb2md -s oldmail -R
239 #
240 # This reads recursively all Mbox mailboxes and creates:
241 #
242 # /home/blah/Maildir/.fffff/
243 # /home/blah/Maildir/.ggggg/
244 # /home/blah/Maildir/.xxx/
245 # /home/blah/Maildir/.xxx.aaaa/
246 # /home/blah/Maildir/.xxx.bbbb/
247 # /home/blah/Maildir/.xxx.cccc/
248 # /home/blah/Maildir/.xxx.dddd/
249 # /home/blah/Maildir/.yyyy/
250 # /home/blah/Maildir/.yyyy.huey/
251 # /home/blah/Maildir/.yyyy.duey/
252 # /home/blah/Maildir/.yyyy.louie/
253 #
254 # The result, from the IMAP client's point of view is:
255 #
256 # Inbox -----------------
257 # |
258 # | fffff -----------
259 # | ggggg -----------
260 # |
261 # - xxx -------------
262 # | | aaaa --------
263 # | | bbbb --------
264 # | | cccc --------
265 # | | dddd --------
266 # |
267 # - yyyy ------------
268 # | huey -------
269 # | duey -------
270 # | louie ------
271 #
272 # Note that although ~/Maildir/.xxx/ and ~/Maildir/.yyyy may appear
273 # as folders to the IMAP client the above commands do not generate
274 # any Maildir folders of these names. These are simply elements
275 # of the names of other Maildir directories. (if you used '-R', they
276 # will be able to act as normal folders, containing messages AND folders)
277 #
278 # With a separate run of this script, using just the "-s" option
279 # without "-f" nor "-R", it would be possible to create mailboxes which
280 # appear at the same location as far as the IMAP client is
281 # concerned. By having Mbox mailboxes in some directory:
282 # ~/oldmail/nn/ of the form:
283 #
284 # /home/blah/oldmail/nn/xxx
285 # /home/blah/oldmail/nn/yyyy
286 #
287 # then the command:
288 #
289 # mb2md -s oldmail/nn
290 #
291 # will create two new Maildirs:
292 #
293 # /home/blah/Maildir/.xxx/
294 # /home/blah/Maildir/.yyyy/
295 #
296 # Then what used to be the xxx and yyyy folders now function as
297 # mailboxes too. Netscape 4.77 needed to be put to sleep and given ECT
298 # to recognise this - deleting the contents of (Win2k example):
299 #
300 # C:\Program Files\Netscape\Users\uu\ImapMail\aaa.bbb.ccc\
301 #
302 # where "uu" is the user and "aaa.bbb.ccc" is the IMAP server
303 #
304 # I often find that deleting all this directory's contents, except
305 # "rules.dat", forces Netscape back to reality after its IMAP innards
306 # have become twisted. Then maybe use File > Subscribe - but this
307 # seems incapable of subscribing to folders.
308 #
309 # For Outlook Express, select the mail server, then click the
310 # "IMAP Folders" button and use "Reset list". In the "All"
311 # window, select the mailboxes you want to see in normal
312 # usage.
313 #
314 #
315 # This script did not recurse subdirectories or delete old mailboxes, before addition of the '-R' parameter :)
316 #
317 # Be sure not to be accessing the Mbox mailboxes while running this
318 # script. It does not attempt to lock them. Likewise, don't run two
319 # copies of this script either.
320 #
321 #
322 # Trickier usage . . .
323 # ====================
324 #
325 # If you have a bunch of mailboxes in a directory ~/oldmail/doors/
326 # and you want them to appear in folders such as:
327 #
328 # ~/Maildir/.music.bands.doors.Jim
329 # ~/Maildir/.music.bands.doors.John
330 #
331 # etc. so they appear in an IMAP folder:
332 #
333 # Inbox -----------------
334 # | music
335 # | bands
336 # | doors
337 # | Jim
338 # | John
339 # | Robbie
340 # | Ray
341 #
342 # Then you could rename the source directory to:
343 #
344 # ~/oldmail/music.bands.doors/
345 #
346 # then use:
347 #
348 # mb2md -s oldmail -f music.bands.doors
349 #
350 #
351 # Or simply use '-R' switch with:
352 # mb2md -s oldmail -R
353 #
354 #
355 # Stripping mailbox extensions:
356 # =============================
357 #
358 # If you want to convert mailboxes that came for example from
359 # a Windows box then you might want to strip the extension of
360 # the mailbox name so that it won't create a subfolder in your
361 # mail client's view.
362 #
363 # Example:
364 # You have several mailboxes named Trash.mbx, Sent.mbx, Drafts.mbx
365 # If you don't strip the extension "mbx" you will get the following
366 # hierarchy:
367 #
368 # Inbox
369 # |
370 # - Trash
371 # | | mbx
372 # |
373 # - Sent
374 # | | mbx
375 # |
376 # - Drafts
377 # | mbx
378 #
379 # This is more than ugly!
380 # Just use:
381 # mb2md -s oldmail -r mbx
382 #
383 # Note: don't specify the dot! It will be stripped off
384 # automagically ;)
385 #
386 #------------------------------------------------------------------------------
387
388
389 use strict;
390 use Getopt::Std;
391 use Date::Parse;
392 use IO::Handle;
393 use Fcntl;
394
# Print the command-line synopsis and abort.
#
# The heading and the first three synopsis lines are written to STDOUT;
# the last synopsis line is used as the die() message, so calling this
# sub never returns normally.
sub usage() {
    print join('',
        "Usage:\n",
        " mb2md -h\n",
        " mb2md [-c] -m [-d destdir]\n",
        " mb2md [-c] -s sourcefile [-d destdir]\n",
    );
    die " mb2md [-c] -s sourcedir [-l wu-mailboxlist] [-R|-f somefolder] [-d destdir] [-r strip_extension]\n";
}
# Parse the command line.  Switches taking a value: -d -f -s -r -l;
# plain flags: -c -h -m -R.
my %opts;
usage() unless getopts('d:f:chms:r:l:R', \%opts);

# -h asks for help; without -m or -s there is nothing to convert.
usage() if defined($opts{h});
usage() unless defined($opts{m}) || defined($opts{s});

# Password-database entry of the real user id running the script;
# only the login name and the home directory are used below.
my ($name, $passwd, $uid, $gid, $quota, $comment, $gcos, $homedir, $shell)
    = getpwuid($<);

# Source and target locations, filled in further down.
my $mbroot;     # base directory holding the mboxes
my $mbdir;      # mbox directory, relative to $mbroot
my $mbfile;     # a single mbox file
my $dest;       # destination Maildir
my $strip_ext;  # extension to strip from mailbox names

# With -c the Content-Length: header (if present) is used to find the
# next message.  Dangerous: may be unreliable.
my $use_cl = defined($opts{c}) ? 1 : 0;
429
# Decide what is going to be converted.
#
# -m : the user's mail spool file ($mbfile).
# -s : either a single mbox file ($mbfile) or a directory of mboxes
#      ($mbroot), optionally narrowed to one sub directory by -f ($mbdir).
if (defined($opts{m}))
{
    if (defined($ENV{'MAIL'})) {
        # $MAIL, when set, takes precedence over the usual spool locations.
        $mbfile = $ENV{'MAIL'};
    } else {
        # Probe the usual spool locations in order; the first regular
        # file found is used.
        foreach my $candidate ("/var/spool/mail/$name", "/var/mail/$name") {
            next unless -f $candidate;
            $mbfile = $candidate;
            last;
        }
        defined($mbfile)
            or die("I searched \$MAIL, /var/spool/mail/$name and /var/mail/$name, ".
                   "but I couldn't find your mail spool file - ");
    }
}
elsif (defined($opts{s}))
{
    # A source not starting with "/" is taken relative to the user's
    # home directory; otherwise it is used as an absolute path.
    $opts{s} = "$homedir/$opts{s}" if ($opts{s} !~ /^\//);

    if (-f $opts{s})
    {
        # The source is a single mbox file.
        $mbfile = $opts{s};
    }
    elsif (-d $opts{s})
    {
        # The source is a directory: remember it without a trailing "/".
        ($mbroot = $opts{s}) =~ s/\/$//;

        # An explicit sub directory (-f) is kept relative to $mbroot,
        # again without a trailing "/".
        if (defined($opts{f}))
        {
            ($mbdir = $opts{f}) =~ s/\/$//;
        }
    }
    else
    {
        die("Fatal: Source is not an mbox file or a directory!\n");
    }
}
483
484
# Destination: the -d value if one was given (and is not empty),
# otherwise "Maildir".  An explicit conditional is used because the
# former `defined(...) && ($dest = ...) || ($dest = "Maildir")` chain
# tested the truth of the assigned value and therefore replaced a
# destination of "0" with "Maildir".
$dest = (defined($opts{d}) && length($opts{d})) ? $opts{d} : "Maildir";

# Extension to strip from mailbox file names (-r), if any.
$strip_ext = $opts{r} if defined($opts{r});

# -R (recursion) and -f (one sub folder) are mutually exclusive.
if (defined($opts{R}) && defined($opts{f})) { die "No recursion with \"-f\"";}

# Optional list of subscribed folders (-l), kept as raw lines; inlist()
# chomps them when comparing.
my @flist;
if (defined($opts{l}))
{
    # Three-argument open with a lexical handle: the user-supplied name
    # is always treated as a plain file name, never as a mode or a pipe.
    open(my $list_fh, '<', $opts{l})
        or die "Could not open mailbox list $opts{l}: $!";
    @flist = <$list_fh>;
    close($list_fh);
}
499
# A destination that does not start with "/" lives below the user's home
# directory, which must therefore exist.
unless ($dest =~ /^\//)
{
    -e $homedir or die("Fatal: home dir $homedir doesn't exist.\n");
    $dest = "$homedir/$dest";
}

# Drop a trailing "/" from the destination.
$dest =~ s/\/$//;

# Number of mailboxes (or at least files) converted so far.
my $mailboxcount = 0;

# Sub-maildirs are created inside the main Maildir, so make sure the
# main one exists first.
maildirmake($dest);
521
# Main dispatch: what happens next depends on whether the source is a
# single mbox file, one sub directory chosen with -f, or a whole
# directory of mboxes.
if (defined($mbfile))
{
    # Single file: convert it straight into $dest.
    if (isamailboxfile($mbfile))
    {
        print "Converting $mbfile to maildir: $dest\n";
        convert($mbfile, $dest);
    }
    else
    {
        print "Skipping $mbfile: not a mbox file\n";
    }
}
elsif (defined($mbdir))
{
    # -f was used: convert the given sub directory of $mbroot.
    print "Converting mboxdir/mbdir: $mbroot/$mbdir to maildir: $dest/\n";

    my $sourcedir = "$mbroot/$mbdir";

    # The directory that is supposed to hold the mbox files must exist
    # and must really be a directory.
    die("Fatal: MBDIR directory $sourcedir/ does not exist.\n") unless -e $sourcedir;
    die("Fatal: MBDIR $sourcedir is not a directory.\n") unless -d $sourcedir;

    convertit($mbdir, "");
}
else
{
    # No -f: walk through the entries of $mbroot itself.
    opendir(my $source_dh, $mbroot)
        or die("Fatal: Cannot open source directory $mbroot/ \n");

    while (my $entry = readdir($source_dh))
    {
        my $entry_path = "$mbroot/$entry";

        if (-d $entry_path)
        {
            # Directories are only descended into when -R was given.
            unless (defined($opts{R}))
            {
                print("$entry is a directory, but '-R' was not used... skipping\n");
                next;
            }
            print "convertit($entry,\"\")\n";
            convertit($entry, "");
            next;
        }

        unless (-f $entry_path)
        {
            print "Skipping $entry_path : not a file nor a dir\n";
            next;
        }

        unless (isamailboxfile($entry_path))
        {
            print "Skipping $entry_path : not a mbox file\n";
            next;
        }

        convertit($entry, "");
    }
    closedir($source_dh);

    print "$mailboxcount files processed.\n";
}

exit 0;
592
593 # My debugging placeholder I can put somewhere to show how far the script ran.
594 # die("So far so good.\n\n");
595
# The isamailboxfile function
# ---------------------------
#
# Decides whether a file looks like an mbox mailbox (and not an
# address book or something else) by inspecting its first line only.
#
# Argument: path of the file to test.
# Returns:  1 if the file is empty (so that an empty maildir can be
#           created for it) or if its first line starts with the mbox
#           separator "From "; 0 otherwise.
# Dies if the file cannot be opened.
sub isamailboxfile {
    my ($mbxfile) = @_;

    return 1 if (-z $mbxfile);

    # A lexical handle keeps the global $_ and the package namespace
    # untouched, and the handle is closed on every path.
    open(my $mbx_fh, '<', $mbxfile) or die "Could not open $mbxfile ! \n";
    my $firstline = <$mbx_fh>;
    close($mbx_fh);

    # Nothing could be read although the path is not a zero-size file:
    # that is not a mailbox.
    return 0 unless defined($firstline);

    # The separator is "From" followed by a space; requiring the space
    # keeps a leading "From:" header line from being taken for an mbox.
    return ($firstline =~ /^From /) ? 1 : 0;
}
619
# The convertit function
# -----------------------
#
# Converts one entry (mbox file or directory) of the source tree and,
# for a directory, recurses into every entry it contains.
#
# Arguments:
#   $dir     - name of the file or directory to handle
#   $oldpath - path of its parent, relative to $mbroot ("" at top level)
#
# The Maildir name is built from $oldpath and $dir: '.' in either part
# becomes '_', then every '/' becomes '.'.  For a file, the -r extension
# is removed from the name first.
#
# The maildir is created before the -l list check and the mbox check,
# so a file skipped by either check still leaves an empty maildir.
#
# Uses the file-level $mbroot, $dest, $strip_ext, %opts and @flist, and
# increments $mailboxcount for every mbox handed to convert().
sub convertit
{
    my ($dir, $oldpath) = @_;

    # Removes the first "//" found in the parent path.
    $oldpath =~ s/\/\///;

    # Skip files beginning with '.' since they are not normally mbox
    # files nor dirs (includes '.' and '..').
    if ($dir =~ /^\./)
    {
        print "Skipping $dir : name begins with a '.'\n";
        return;
    }

    my $srcdir       = "$mbroot/$oldpath/$dir";
    my $is_directory = -d $srcdir;

    my $destinationdir = $dir;
    my $temppath       = $oldpath;

    # Strip the extension while the name still contains its '.'.  This
    # used to be attempted after the '.' -> '_' conversion below, where
    # "\.ext" could never match, so -r had no effect.  The extension is
    # quoted so that it is matched literally.
    if (!$is_directory && defined($strip_ext))
    {
        $destinationdir =~ s/\.\Q$strip_ext\E$//;
    }

    # A '.' in a name would become a folder separator in the Maildir,
    # so '.' is converted to '_'.
    $temppath       =~ s/\./\_/g;
    $destinationdir =~ s/\./\_/g;

    # Prepend the parent path, then turn '/' into '.'.
    $destinationdir = "$temppath.$destinationdir";
    $destinationdir =~ s/\/+/\./g;

    # print, not printf: the names may contain '%'.
    print("convertit(): Converting $dir in $mbroot/$oldpath to $dest/$destinationdir\n");
    maildirmake("$dest/$destinationdir");
    print("destination = $destinationdir\n");

    if ($is_directory)
    {
        opendir(my $subdir_dh, $srcdir) or die "can't open $srcdir !\n";
        my @subdirlist = readdir($subdir_dh);
        closedir($subdir_dh);

        foreach my $entry (@subdirlist)
        {
            next if ($entry =~ /^\.+$/);
            print("Sub: $entry\n");
            print("convertit($entry,\"$oldpath/$dir\")\n");
            convertit($entry, "$oldpath/$dir");
        }
        return;
    }

    # From here on we are dealing with a file.

    # With -l, only folders named in the list are converted.
    return if (defined($opts{l}) && !inlist("$oldpath/$dir", @flist));

    if (!isamailboxfile($srcdir))
    {
        print "Skipping $dir (is not mbox)\n";
        # return, not next: there is no loop in this sub to continue.
        return;
    }

    convert($srcdir, "$dest/$destinationdir");
    $mailboxcount++;
}
# The maildirmake function
# ------------------------
#
# Does the same thing as the maildirmake binary that comes with the
# courier-imap distribution: for every path given as argument, create
# the directory itself (unless it already exists) and its tmp/, new/
# and cur/ sub directories, all with mode 0700.
#
# Dies as soon as one of the directories cannot be created.
sub maildirmake
{
    for my $maildir (@_) {
        unless (-d $maildir) {
            mkdir($maildir, 0700)
                or die("Fatal: Directory $maildir doesn't exist and can't be created.\n");
        }

        for my $part (qw(tmp new cur)) {
            next if -d "$maildir/$part";
            mkdir("$maildir/$part", 0700)
                or die("Fatal: Unable to make $maildir/$part/ subdirectory.\n");
        }
    }
}
707
# The inlist function
# ------------------------
#
# Tells whether a folder is in the list of subscribed WU-IMAP folders.
#
# Arguments: the folder path (one leading "/" is ignored), followed by
#            the list entries, which may still carry their line ending.
# Returns:   1 if the folder equals one of the entries, 0 otherwise.
# Prints a line saying whether the folder is in the list.
sub inlist
{
    my ($candidate, @subscribed) = @_;

    # Get rid of the first / if any
    $candidate =~ s/^\///;

    # Work on our own copies of the entries, without line endings.
    chomp(@subscribed);

    my $found = (grep { $_ eq $candidate } @subscribed) ? 1 : 0;

    print $found ? "$candidate is in list\n" : "$candidate is not in list\n";

    return $found;
}
736
737 #
738
739 # The convert function
740 # ---------------------
741 #
742 # This function does the down and dirty work of
743 # actually converting the mbox to a maildir
744 #
745 sub convert
746 {
747 # get the source and destination as arguments
748 my ($mbox, $maildir) = @_;
749
750 printf("Source Mbox is $mbox\n");
751 printf("Target Maildir is $maildir \n") ;
752
753 # create the directories for the new maildir
754 #
755 # if it is the root maildir (ie. converting the inbox)
756 # these already exist but thats not a big issue
757
758 &maildirmake($maildir);
759
760 # Change to the target mailbox directory.
761
762 chdir "$maildir" ;
763
764 # Converts a Mbox to multiple files
765 # in a Maildir.
766 # This is adapted from mbox2maildir.
767 #
768 # Open the Mbox mailbox file.
769
770
771 if (sysopen(MBOX, "$mbox", O_RDONLY))
772 {
773 #printf("Converting Mbox $mbox . . . \n");
774 }
775 else
776 {
777 die("Fatal: unable to open input mailbox file: $mbox ! \n");
778 }
779
780 # This loop scans the input mailbox for
781 # a line starting with "From ". The
782 # "^" before it is pattern-matching
783 # lingo for it being at the start of a
784 # line.
785 #
786 # Each email in Mbox mailbox starts
787 # with such a line, which is why any
788 # such line in the body of the email
789 # has to have a ">" put in front of it.
790 #
791 # This is not required in a Maildir
792 # mailbox, and some majik below
793 # finds any such quoted "> From"s and
794 # gets rid of the "> " quote.
795 #
796 # Each email is put in a file
797 # in the cur/ subdirectory with a
798 # name of the form:
799 #
800 # nnnnnnnnn.cccc.mbox:2,XXXX
801 #
802 # where:
803 # "nnnnnnnnn" is the Unix time since
804 # 1970 when this script started
805 # running, incremented by 1 for
806 # every email. This is to ensure
807 # unique names for each message
808 # file.
809 #
810 # ".cccc" is the message count of
811 # messages from this mbox.
812 #
813 # ".mbox" is just to indicate that
814 # this message was converted from
815 # an Mbox mailbox.
816 #
817 # ":2," is the start of potentially
818 # multiple IMAP flag characters
819 # "XXXX", but may be followed by
820 # nothing.
821 #
822 # This is sort-of compliant with
823 # the Maildir naming conventions
824 # specified at:
825 #
826 # http://www.qmail.org/man/man5/maildir.html
827 #
828 # This approach does not involve the
829 # process ID or the hostname, but it is
830 # probably good enough.
831 #
832 # When the IMAP server looks at this
833 # mailbox, it will move the files to
834 # the cur/ directory and change their
835 # names as it pleases. In the case
836 # of Courier IMAP, the names will
837 # become like:
838 #
839 # 995096541.25351.mbox:2,S
840 #
841 # with 25351 being Courier IMAP's
842 # process ID. The :2, is the start
843 # of the flags, and the "S" means
844 # that this one has been seen by
845 # the user. (But is this the same
846 # meaning as the user actually
847 # having opened the message to see
848 # its contents, rather than just the
849 # IMAP server having been asked to
850 # list the message's Subject etc.
851 # so the client could list it in the
852 # visible Inbox?)
853 #
854 # This contrasts with a message
855 # created by Courier IMAP, say with
856 # a message copy, which is like:
857 #
858 # 995096541.25351.zair,S=14285:2,S
859 #
860 # where ",S=14285" is the size of the
861 # message in bytes.
862 #
863 # Courier Maildrop's names are similar
864 # but lack the ":2,XXXX" flags . . .
865 # except for my modified Maildrop
866 # which can deliver them with a
867 # ":2,T" - flagged for deletion.
868 #
869 # I have extended the logic of the
870 # per-message inner loop to stop
871 # saving a file for a message with:
872 #
873 # Subject: DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA
874 #
875 # This is the dummy message, always
876 # at the start of an Mbox format
877 # mailbox file - and is put there
878 # by UW IMAPD. Since quite a few
879 # people will use this for
880 # converting from a UW system,
881 # I figure it is worth it.
882 #
883 # I will not save any such message
884 # file for the dummy message.
885 #
886 # Plan
887 # ----
888 #
889 # We want to read the entire Mbox file, whilst
890 # going through a loop for each message we find.
891 #
892 # We want to read all the headers of the message,
893 # starting with the "From " line. For that "From "
894 # line we want to get a date.
895 #
896 # For all other header lines, we want to store them
897 # in $headers whilst parsing them to find:
898 #
899 # 1 - Any flags in the "Status: " or "X-Status: " or
900 # "X-Mozilla-Status: " lines.
901 #
902 # 2 - A subject line indicating this is the dummy message
903 # at the start (typically, but not necessarily) of
904 # the Mbox.
905 #
906 # Once we reach the end of the headers, we will crunch any
907 # flags we found to create a file name. Then, unless this is
908 # the dummy message we create that file and write all the
909 # headers to it.
910 #
911 # Then we continue reading the Mbox, converting ">From " to
912 # "From " and writing it to the file, until we reach one of:
913 #
914 # 1 - Another "From " line (indicating the start of another
915 # message).
916 #
917 # or
918 #
919 # 2 - The end of the Mbox.
920 #
921 # In the former case, which we detect at the start of the loop
922 # we need to close the file and touch it to alter its date-time.
923 #
924 # In the latter case, we also need to close the file and touch
925 # it to alter its date-time - but this is beyond the end of the
926 # loop.
927
928
929 # Variables
930 # ---------
931
932 my $messagecount = 0;
933
934 # For generating unique filenames for
935 # each message. Initialise it here with
936 # numeric time in seconds since 1970.
937 my $unique = time;
938
939 # Name of message file to delete if we found that
940 # it was created by reading the Mbox dummy message.
941
942 my $deletedummy = '';
943
944 # To store the complete "From (address) (date-time)
945 # which delineates the start of each message
946 # in the Mbox
947 my $fromline = '';
948
949
950 # Set to 1 when we are reading the header lines,
951 # including the "From " line.
952 #
953 # 0 means we are reading the message body and looking
954 # for another "From " line.
955
956 my $inheaders = 0;
957
958 # Variable to hold all headers (apart from
959 # the first line "From ...." which is not
960 # part of the message itself.
961 my $headers = '';
962
963 # Variable to hold the accumulated characters
964 # we find in header lines of the type:
965 #
966 # Status:
967 # X-Status:
968 # X-Mozilla-Status:
969 # X-Evolution:
970 my $flags = '';
971
972 # To build the file name for the message in.
973 my $messagefn = '';
974
975
976 # The date string from the "From " line of each
977 # message will be written here - and used by
978 # touch to alter the date-time of each message
979 # file. Put non-date text here to make it
980 # spit the dummy if my code fails to find a
981 # date to write into this.
982
983 my $receivedate = 'Bogus';
984
985 # The subject of the message
986 my $subject = '';
987
988 my $previous_line_was_empty = 1;
989
990 # We record the message start line here, for error
991 # reporting.
992 my $startline;
993
994 # If defined, we use this as the number of bytes in the
995 # message body rather than looking for a /^From / line.
996 my $contentlength;
997
998 # A From line can either occur as the first
999 # line of a file, or after an empty line.
1000 # Most mail systems will quote all From lines
1001 # appearing in the message, but some will only
1002 # do it when necessary.
1003 # Since we initialise the variable to true,
1004 # we don't need to check for beginning of file.
1005
1006 while(<MBOX>)
1007 {
1008 # exchange possible Windows EOL (CRLF) with Unix EOL (LF)
1009 $_ =~ s/\r\n$/\n/;
1010
1011 if ( /^From /
1012 && $previous_line_was_empty
1013 && (!defined $contentlength)
1014 )
1015 {
1016 # We are reading the "From " line which has an
1017 # email address followed by a receive date.
1018 # Turn on the $inheaders flag until we reach
1019 # the end of the headers.
1020
1021 $inheaders = 1;
1022
1023 # record the message start line
1024
1025 $startline = $.;
1026
1027 # If this is not the first run through the loop
1028 # then this means we have already been working
1029 # on a message.
1030
1031 if ($messagecount > 0)
1032 {
1033 # If so, then close that message file and then
1034 # use utime to change its date-time.
1035 #
1036 # Note this code should be duplicated to do
1037 # the same thing at the end of the while loop
1038 # since we must close and touch the final message
1039 # file we were writing when we hit the end of the
1040 # Mbox file.
1041
1042 close (OUT);
1043 if ($messagefn ne '') {
1044 my $t = str2time($receivedate);
1045 utime $t, $t, $messagefn;
1046 }
1047 }
1048
1049 # Because we opened the Mbox file without any
1050 # variable, I think this means that we have its
1051 # current line in Perl's default variable "$_".
1052 # So all sorts of pattern matching magic works
1053 # directly on it.
1054
1055 # We are currently reading the first line starting with
1056 # "From " which contains the date we want.
1057 #
1058 # This will be of the form:
1059 #
1060 # From dduck@test.org Wed Nov 24 11:05:35 1999
1061 #
1062 # at least with UW-IMAP.
1063 #
1064 # However, I did find a nasty exception to this in my
1065 # tests, of the form:
1066 #
1067 # "bounce-MusicNewsletter 5-rw=test.org"@announce2.mp3.com
1068 #
1069 # This makes it trickier to get rid of the email address,
1070 # but I did find a way. I can't rule out that there would
1071 # be some address like this with an "@" in the quoted
1072 # portion too.
1073 #
1074 # Unfortunately, testing with an old Inbox Mbox file,
1075 # I also found an instance where the email address
1076 # had no @ sign at all. It was just an email
1077 # account name, with no host.
1078 #
1079 # I could search for the day of the week. If I skipped
1080 # at least one word of non-whitespace (1 or more contiguous
1081 # non-whitespace characters) then searched for a day of
1082 # the week, then I should be able to avoid almost
1083 # every instance of a day of the week appearing in
1084 # the email address.
1085 #
1086 # Do I need a failsafe arrangement to provide some
1087 # other date to touch if I don't get what seems like
1088 # a date in my resulting string? For now, no.
1089 #
1090 # I will take one approach if there is an @ in the
1091 # "From " line and another (just skip the first word
1092 # after "From ") if there is no @ in the line.
1093 #
1094 # If I knew more about Perl I would probably do it in
1095 # a more elegant way.
1096
1097 # Copy the current line into $fromline.
1098
1099 $fromline = $_;
1100
1101 # Now get rid of the "From ". " =~ s" means substitute.
1102 # Find the word "From " at the start of the line and
1103 # replace it with nothing. The nothing is what is
1104 # between the second and third slash.
1105
1106 $fromline =~ s/^From // ;
1107
1108
1109 # Likewise get rid of the email address.
1110 # This first section is if we determine there is one
1111 # (or more . . . ) "@" characters in the line, which
1112 # would normally be the case.
1113
1114 if ($fromline =~ m/@/)
1115 {
1116 # The line has at least one "@" in it, so we assume
1117 # this is in the middle of an email address.
1118 #
1119 # If the email address had no spaces, then we could
1120 # get rid of the whole thing by searching for any number
1121 # of non-whitespace characters (\S) contiguously, and
1122 # then I think a space. Substitute nothing for this.
1123 #
1124 # $fromline =~ s/(\S)+ // ;
1125 #
1126 # But we need something to match any number of non-@
1127 # characters, then the "@" and then all the non-whitespace
1128 # characters from there (which takes us to the end of
1129 # "test.org") and then the space following that.
1130 #
1131 # A tutorial on regular expressions is:
1132 #
1133 # http://www.perldoc.com/perl5.6.1/pod/perlretut.html
1134 #
1135 # Get rid of all non-@ characters up to the first "@":
1136
1137 $fromline =~ s/[^@]+//;
1138
1139
1140 # Get rid of the "@".
1141
1142 $fromline =~ s/@//;
1143 }
1144 # If there was an "@" in the line, then we have now
1145 # removed the first one (lets hope there aren't more!)
1146 # and everything which preceded it.
1147 #
1148 # we now remove either something like
1149 # '(foo bar)'. eg. '(no mail address)',
1150 # or everything after the '@' up to the trailing
1151 # timezone
1152 #
1153 # FIXME: all those regexp should be combined to just one single one
1154
1155 $fromline =~ s/(\((\S*| )+\)|\S+) *//;
1156
1157 chomp $fromline;
1158
1159 # Stash the date-time for later use. We will use it
1160 # to touch the file after we have closed it.
1161
1162 $receivedate = $fromline;
1163
1164 # Debugging lines:
1165 #
1166 # print "$receivedate is the receivedate of message $messagecount.\n";
1167 # $receivedate = "Wed Nov 24 11:05:35 1999";
1168 #
1169 # To look at the exact date-time of files:
1170 #
1171 # ls -lFa --full-time
1172 #
1173 # End of handling the "From " line.
1174 }
1175
1176
1177 # Now process header lines which are not the "From " line.
1178
1179 if ( ($inheaders eq 1)
1180 && (! /^From /)
1181 )
1182 {
1183 # Now we are reading the header lines after the "From " line.
1184 # Keep looking for the blank line which indicates the end of the
1185 # headers.
1186
1187
1188 # ".=" means append the current line to the $headers
1189 # variable.
1190 #
1191 # For some reason, I was getting two blank lines
1192 # at the end of the headers, rather than one,
1193 # so I decided not to read in the blank line
1194 # which terminates the headers.
1195 #
1196 # Delete the "unless ($_ eq "\n")" to get rid
1197 # of this kludge.
1198
1199 $headers .= $_ unless ($_ eq "\n");
1200
1201 # Now scan the line for various status flags
1202 # and to find the Subject line.
1203
1204 $flags .= $1 if /^Status: ([A-Z]+)/;
1205 $flags .= $1 if /^X-Status: ([A-Z]+)/;
1206 if (/^X-Mozilla-Status: ([0-9a-f]{4})/i)
1207 {
1208 $flags .= 'R' if (hex($1) & 0x0001);
1209 $flags .= 'A' if (hex($1) & 0x0002);
1210 $flags .= 'D' if (hex($1) & 0x0008);
1211 }
1212 if(/^X\-Evolution:\s+\w{8}\-(\w{4})/oi)
1213 {
1214 $b = pack("H4", $1); #pack it as 4 digit hex (0x0000)
1215 $b = unpack("B32", $b); #unpack into bit string
1216
1217 # "usually" only the right most six bits are used
1218 # however, I have come across a seventh bit in
1219 # about 15 (out of 10,000) messages with this bit
1220 # activated.
1221 # I have not found any documentation in the source.
1222 # If you find out what it does, please let me know.
1223
1224 # Notes:
1225 # Evolution 1.4 does mark forwarded messages.
1226 # The sixth bit is to denote an attachment
1227
1228 $flags .= 'A' if($b =~ /[01]{15}1/); #replied
1229 $flags .= 'D' if($b =~ /[01]{14}1[01]{1}/); #deleted
1230 $flags .= 'T' if($b =~ /[01]{13}1[01]{2}/); #draft
1231 $flags .= 'F' if($b =~ /[01]{12}1[01]{3}/); #flagged
1232 $flags .= 'R' if($b =~ /[01]{11}1[01]{4}/); #seen/read
1233 }
1234 $subject = $1 if /^Subject: (.*)$/;
1235 if ($use_cl eq 1)
1236 {
1237 $contentlength = $1 if /^Content-Length: (\d+)$/;
1238 }
1239
1240 # Now look out for the end of the headers - a blank
1241 # line. When we find it, create the file name and
1242 # analyse the Subject line.
1243
1244 if ($_ eq "\n")
1245 {
1246 # We are at the end of the headers. Set the
1247 # $inheaders flag back to 0.
1248
1249 $inheaders = 0;
1250
1251 # Include the current newline in the content length
1252
1253 ++$contentlength if defined $contentlength;
1254
1255 # Create the file name for the current message.
1256 #
1257 # A simple version of this would be:
1258 #
1259 # $messagefn = "cur/$unique.$messagecount.mbox:2,";
1260 #
1261 # This would create names with $messagecount values of
1262 # 1, 2, etc. But for neatness when looking at a
1263 # directory of such messages, sorted by filename,
1264 # I want to have leading zeroes on message count, so
1265 # that they would be 000001 etc. This makes them
1266 # appear in message order rather than 1 being after
1267 # 19 etc. So this is good for up to 999,999 messages
1268 # in a mailbox. It is a cosmetic matter for a person
1269 # looking into the Maildir directory manually.
1270 # To do this, use sprintf instead with "%06d" for
1271 # 6 characters of zero-padding:
1272
1273 $messagefn = sprintf ("cur/%d.%06d.mbox:2,", $unique, $messagecount) ;
1274
1275
1276 # Append flag characters to the end of the
1277 # filename, according to flag characters
1278 # collected from the message headers
1279
1280 $messagefn .= 'F' if $flags =~ /F/; # Flagged.
1281 $messagefn .= 'R' if $flags =~ /A/; # Replied to.
1282 $messagefn .= 'S' if $flags =~ /R/; # Seen or Read.
1283 $messagefn .= 'T' if $flags =~ /D/; # Tagged for deletion.
1284
1285
1286 # Opens filename $messagefn for output (>) with filehandle OUT.
1287
1288 open(OUT, ">$messagefn") or die("Fatal: unable to create new message $messagefn");
1289
1290 # Count the messages.
1291
1292 $messagecount++;
1293
1294 # Only for the first message,
1295 # check to see if it is a dummy.
1296 # Delete the message file we
1297 # just created if it was for the
1298 # dummy message at the start
1299 # of the Mbox.
1300 #
1301 # Add search terms as required.
1302 # The last 2 lines are for rent.
1303 #
1304 # "m" means match the regular expression,
1305 # but we can do without it.
1306 #
1307 # Do I need to escape the ' in "DON'T"?
1308 # I didn't in the original version.
1309
1310 if ( (($messagecount == 1) && defined($subject))
1311 && ($subject =~ m/^DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA/)
1312 )
1313 {
1314 # Stash the file name of the dummy message so we
1315 # can delete it later.
1316
1317 $deletedummy = "$messagefn";
1318 }
1319
1320 # Print the collected headers to the message file.
1321
1322 print OUT "$headers";
1323
1324
1325 # Clear $headers and $flags ready for the next message.
1326
1327 $headers = '';
1328 $flags = '';
1329
1330 # End of processing the headers once we found the
1331 # blank line which terminated them
1332 }
1333
1334 # End of dealing with the headers.
1335 }
1336
1337
1338 if ( $inheaders eq 0)
1339 {
1340
1341 # We are now processing the message body.
1342 #
1343 # Now we have passed the headers to the
1344 # output file, we scan until the while
1345 # loop finds another "From " line.
1346
1347 # Decrement our content length if we're
1348 # using it to find the end of the message
1349 # body
1350
1351 if (defined $contentlength) {
1352
1353 # Decrement our $contentlength variable
1354
1355 $contentlength -= length($_);
1356
1357 # The proper end for a message with Content-Length
1358 # specified is that the $contentlength variable is
1359 # exactly -1 and we are on a bare
1360 # newline. Note that the bare newline is not
1361 # printed to the end of the current message as
1362 # it's actually a message separator in the mbox
1363 # format rather than part of the message. The
1364 # next line _should_ be a From_ line, but just in
1365 # case the Content-Length header is incorrect
1366 # (e.g. a corrupt mailbox), we just continue
1367 # putting lines into the current message until we
1368 # see the next From_ line.
1369
1370 if ($contentlength < 0) {
1371 if ($contentlength == -1 && $_ eq "\n") {
1372 $contentlength = undef;
1373 next;
1374 }
1375 $contentlength = undef;
1376 }
1377 }
1378
1379 #
1380 # We want to copy every part of the message
1381 # body to the output file, except for the
1382 # quoted ">From " lines, which was the
1383 # way the IMAP server encoded body lines
1384 # starting with "From ".
1385 #
1386 # Pattern matching Perl majik to
1387 # get rid of an Mbox quoted From.
1388 #
1389 # This works on the default variable "$_" which
1390 # contains the text from the Mbox mailbox - I
1391 # guess this is the case because of our
1392 # (open(MBOX ....) line above, which did not
1393 # assign this to anything else, so it would go
1394 # to the default variable. This enables
1395 # inscrutably terse Perlisms to follow.
1396 #
1397 # "s" means "Substitute" and it looks for any
1398 # occurrence of ">From" starting at the start
1399 # of the line. When it finds this, it replaces
1400 # it with "From".
1401 #
1402 # So this finds all instances in the Mbox message
1403 # where the original line started with the word
1404 # "From" but was converted to ">From" in order to
1405 # not be mistaken for the "From ..." line which
1406 # is used to demarcate each message in the Mbox.
1407 # This was a destructive conversion because
1408 # any message which originally had ">From" at the
1409 # start of the line, before being put into the
1410 # Mbox, will now have that line without the ">".
1411
1412 s/^>From /From /;
1413
1414 # Glorious terseness here. Thanks Simon for
1415 # explaining this.
1416 #
1417 # "print OUT" means print the default variable to
1418 # the file of file handle OUT. This is where
1419 # the bulk of the message text is written to
1420 # the output file.
1421
1422 print OUT or die("Fatal: unable to write to new message to $messagefn");
1423
1424
1425 # End of the if statement dealing with message body.
1426 }
1427
1428 $previous_line_was_empty = ( $_ eq "\n" );
1429
1430 # End of while (MBOX) loop.
1431 }
1432 # Close the input file.
1433
1434 close(MBOX);
1435
1436 # Close the output file, and duplicate the code
1437 # from the start of the while loop which touches
1438 # the date-time of the most recent message file.
1439
1440 close(OUT);
1441 if ($messagefn ne '') {
1442 my $t = str2time($receivedate);
1443 utime $t, $t, $messagefn;
1444 }
1445
1446 # After all the messages have been
1447 # converted, check to see if the
1448 # first one was a dummy.
1449 # if so, delete it and make
1450 # the message count one less.
1451
1452 if ($deletedummy ne "")
1453 {
1454 printf("Dummy mail system first message detected and not saved.\n");
1455 unlink $deletedummy;
1456
1457 $messagecount--;
1458
1459 }
1460
1461 printf("$messagecount messages.\n\n");
1462 }