One of the C programs I was working on this weekend had to find all files that satisfy a certain predicate and add them to a list of “pending work”. The first thing that comes to mind is probably a custom opendir(), readdir(), closedir() hack. This is probably ok when one only has these structures, but it also a bit cumbersome. There are various sorts of DIR and dirent structures and recursing down a large path requires manually keeping track of a lot of state.
A small program that uses opendir() and its friends to traverse a hierarchy of files and print their names may look like this:
% cat -n opendir-sample.c
1 #include <sys/types.h>
2
3 #include <sys/stat.h>
4
5 #include <assert.h>
6 #include <dirent.h>
7 #include <limits.h>
8 #include <stdio.h>
9 #include <string.h>
10
11 static int ptree(char *curpath, char * const path);
12
13 int
14 main(int argc, char * const argv[])
15 {
16 int k;
17 int rval;
18
19 for (rval = 0, k = 1; k < argc; k++)
20 if (ptree(NULL, argv[k]) != 0)
21 rval = 1;
22 return rval;
23 }
24
25 static int
26 ptree(char *curpath, char * const path)
27 {
28 char ep[PATH_MAX];
29 char p[PATH_MAX];
30 DIR *dirp;
31 struct dirent entry;
32 struct dirent *endp;
33 struct stat st;
34
35 if (curpath != NULL)
36 snprintf(ep, sizeof(ep), "%s/%s", curpath, path);
37 else
38 snprintf(ep, sizeof(ep), "%s", path);
39 if (stat(ep, &st) == -1)
40 return -1;
41 if ((dirp = opendir(ep)) == NULL)
42 return -1;
43 for (;;) {
44 endp = NULL;
45 if (readdir_r(dirp, &entry, &endp) == -1) {
46 closedir(dirp);
47 return -1;
48 }
49 if (endp == NULL)
50 break;
51 assert(endp == &entry);
52 if (strcmp(entry.d_name, ".") == 0 ||
53 strcmp(entry.d_name, "..") == 0)
54 continue;
55 if (curpath != NULL)
56 snprintf(ep, sizeof(ep), "%s/%s/%s", curpath,
57 path, entry.d_name);
58 else
59 snprintf(ep, sizeof(ep), "%s/%s", path,
60 entry.d_name);
61 if (stat(ep, &st) == -1) {
62 closedir(dirp);
63 return -1;
64 }
65 if (S_ISREG(st.st_mode) || S_ISDIR(st.st_mode)) {
66 printf("%c %s\n", S_ISDIR(st.st_mode) ? 'd' : 'f', ep);
67 }
68 if (S_ISDIR(st.st_mode) == 0)
69 continue;
70 if (curpath != NULL)
71 snprintf(p, sizeof(p), "%s/%s", curpath, path);
72 else
73 snprintf(p, sizeof(p), "%s", path);
74 snprintf(ep, sizeof(ep), "%s", entry.d_name);
75 ptree(p, ep);
76 }
77 closedir(dirp);
78 return 0;
79 }
With more than 80 lines, this looks a bit too complex for the simple task it does. It has to keep a lot of temporary state information around in the two ep[] and p[] buffers, and all the manual work of setting updating and maintaining this internal state is adding so much noise around the actual printf() statement at line 68 that it is almost too hard to understand what this particular bit of code is supposed to do.
The program still “works”, in a way, so if you compile and run it, the expected results come up:
keramida@kobe:/home/keramida$ cc -O2 opendir-sample.c keramida@kobe:/home/keramida$ ./a.out /tmp d /tmp/.snap d /tmp/.X11-unix d /tmp/.XIM-unix d /tmp/.ICE-unix d /tmp/.font-unix f /tmp/aprtdTjbX f /tmp/aprEdWP4d d /tmp/fam-gdm f /tmp/.X0-lock d /tmp/fam-keramida d /tmp/.esd-1000 d /tmp/screens d /tmp/screens/S-root d /tmp/screens/S-keramida d /tmp/emacs1000 f /tmp/a f /tmp/b f /tmp/kot f /tmp/logsort keramida@kobe:/home/keramida$ ./a.out /tmp /etc/defaults d /tmp/.snap d /tmp/.X11-unix d /tmp/.XIM-unix d /tmp/.ICE-unix d /tmp/.font-unix f /tmp/aprtdTjbX f /tmp/aprEdWP4d d /tmp/fam-gdm f /tmp/.X0-lock d /tmp/fam-keramida d /tmp/.esd-1000 d /tmp/screens d /tmp/screens/S-root d /tmp/screens/S-keramida d /tmp/emacs1000 f /tmp/a f /tmp/b f /tmp/kot f /tmp/logsort f /etc/defaults/rc.conf f /etc/defaults/bluetooth.device.conf f /etc/defaults/devfs.rules f /etc/defaults/periodic.conf
But this program looks “ugly”. Fortunately, the BSDs and Linux provide a more elegant interface for traversing file hierarchies: the fts(3) family of functions. A similar program that uses fts(3) to traverse the filesystem hierarchies rooted at the arguments of main() is:
% cat -n fts-sample.c
1 #include <sys/types.h>
2
3 #include <sys/stat.h>
4
5 #include <err.h>
6 #include <fts.h>
7 #include <stdio.h>
8
9 static int ptree(char * const argv[]);
10
11 int
12 main(int argc, char * const argv[])
13 {
14 int rc;
15
16 if ((rc = ptree(argv + 1)) != 0)
17 rc = 1;
18 return rc;
19 }
20
21 static int
22 ptree(char * const argv[])
23 {
24 FTS *ftsp;
25 FTSENT *p, *chp;
26 int fts_options = FTS_COMFOLLOW | FTS_LOGICAL | FTS_NOCHDIR;
27 int rval = 0;
28
29 if ((ftsp = fts_open(argv, fts_options, NULL)) == NULL) {
30 warn("fts_open");
31 return -1;
32 }
33 /* Initialize ftsp with as many argv[] parts as possible. */
34 chp = fts_children(ftsp, 0);
35 if (chp == NULL) {
36 return 0; /* no files to traverse */
37 }
38 while ((p = fts_read(ftsp)) != NULL) {
39 switch (p->fts_info) {
40 case FTS_D:
41 printf("d %s\n", p->fts_path);
42 break;
43 case FTS_F:
44 printf("f %s\n", p->fts_path);
45 break;
46 default:
47 break;
48 }
49 }
50 fts_close(ftsp);
51 return 0;
52 }
This version is not particularly smaller; it’s only 34-35% smaller in LOC. It is, however, far more elegant and a lot easier to read:
- By using a higher level interface, the program is shorter and easier to understand.
- By using simpler constructs in the fts_read() loop, it very obvious what the program does for each file type (file vs. directory).
- The FTS_COMFOLLOW flag sets up things for following symbolic links in one simple place (something entirely missing from the opendir version).
- There are no obvious bugs about copying half of a pathname, or forgetting to recurse in some cases, or forgetting to print some directory because of a complex interaction between superfluous bits of code. Simpler is also less prone to bugs in this case.
So the next time you are about to build a filesystem traversal toolset from scratch, you can avoid all the pain (and bugs): use fts(3)! :-)
Hey Giorgos :)
Thanks for yet another informative post. POSIX mandates a similar interface, ftw(3). But, to quote Issue 7, “The ftw() function may be removed in a future version.”. On the other hand it doesn’t say whether it will be replaced by some other interface.
Anyway, ftw(3) is implemented on top of fts(3) in NetBSD that I checked.
Cheers,
Stathis
Comment by enorl — 2009-07-05 @ 09:05:59 |