Merge Sort & External Sort in C

/* merge sort and external sort */
/* EXTMRG.C */
#include 
#include 
#include 

/****************************
 * implementation dependent *
 ****************************/

/* template for workfiles (8.3 format) */
#define FNAME "_sort%03d.dat"
#define LNAME 13

/* comparison operators */
#define compLT(x,y) (x < y)
#define compGT(x,y) (x > y)

/* define the record to be sorted here */
#define LRECL 100
typedef int keyType;
typedef struct recTypeTag {
    keyType key;                                /* sort key for record */
    #if LRECL
        char data[LRECL-sizeof(keyType)];       /* other fields */
    #endif
} recType;

/******************************
 * implementation independent *
 ******************************/

typedef enum {false, true} bool;

typedef struct tmpFileTag {
    FILE *fp;                   /* file pointer */
    char name[LNAME];           /* filename */
    recType rec;                /* last record read */
    int dummy;                  /* number of dummy runs */
    bool eof;                   /* end-of-file flag */
    bool eor;                   /* end-of-run flag */
    bool valid;                 /* true if rec is valid */
    int fib;                    /* ideal fibonacci number */
} tmpFileType;

static tmpFileType **file;      /* array of file info for tmp files */
static int nTmpFiles;           /* number of tmp files */
static char *ifName;            /* input filename */
static char *ofName;            /* output filename */

static int level;               /* level of runs */
static int nNodes;              /* number of nodes for selection tree */

void deleteTmpFiles(void) {
    int i;

    /* delete merge files and free resources */
    if (file) {
        for (i = 0; i < nTmpFiles; i++) {
            if (file[i]) {
                if (file[i]->fp) fclose(file[i]->fp);
                if (*file[i]->name) remove(file[i]->name);
                free (file[i]);
            }
        }
        free (file);
    }
}

void termTmpFiles(int rc) {

    /* cleanup files */
    remove(ofName);
    if (rc == 0) {
        int fileT;

        /* file[T] contains results */
        fileT = nTmpFiles - 1;
        fclose(file[fileT]->fp); file[fileT]->fp = NULL;
        if (rename(file[fileT]->name, ofName)) {
            perror("io1");
            deleteTmpFiles();
            exit(1);
        }
        *file[fileT]->name = 0;
    }
    deleteTmpFiles();
}

void cleanExit(int rc) {

    /* cleanup tmp files and exit */
    termTmpFiles(rc);
    exit(rc);
}

void *safeMalloc(size_t size) {
    void *p;

    /* safely allocate memory and initialize to zero */
    if ((p = calloc(1, size)) == NULL) {
        printf("error: malloc failed, size = %d\n", size);
        cleanExit(1);
    }
    return p;
}

void initTmpFiles(void) {
    int i;
    tmpFileType *fileInfo;

    /* initialize merge files */
    if (nTmpFiles < 3) nTmpFiles = 3;
    file = safeMalloc(nTmpFiles * sizeof(tmpFileType*));
    fileInfo = safeMalloc(nTmpFiles * sizeof(tmpFileType));
    for (i = 0; i < nTmpFiles; i++) {
        file[i] = fileInfo + i;
        sprintf(file[i]->name, FNAME, i);
        if ((file[i]->fp = fopen(file[i]->name, "w+b")) == NULL) {
            perror("io2");
            cleanExit(1);
        }
    }
}

recType *readRec(void) {

    typedef struct iNodeTag {   /* internal node */
        struct iNodeTag *parent;/* parent of internal node */
        struct eNodeTag *loser; /* external loser */
    } iNodeType;

    typedef struct eNodeTag {   /* external node */
        struct iNodeTag *parent;/* parent of external node */
        recType rec;            /* input record */
        int run;                /* run number */
        bool valid;             /* input record is valid */
    } eNodeType;

    typedef struct nodeTag {
        iNodeType i;            /* internal node */
        eNodeType e;            /* external node */
    } nodeType;

    static nodeType *node;      /* array of selection tree nodes */
    static eNodeType *win;      /* new winner */
    static FILE *ifp;           /* input file */
    static bool eof;            /* true if end-of-file, input */
    static int maxRun;          /* maximum run number */
    static int curRun;          /* current run number */
    iNodeType *p;               /* pointer to internal nodes */
    static bool lastKeyValid;   /* true if lastKey is valid */
    static keyType lastKey;     /* last key written */

    /* read next record using replacement selection */

    /* check for first call */
    if (node == NULL) {
        int i;

        if (nNodes < 2) nNodes = 2;
        node = safeMalloc(nNodes * sizeof(nodeType));
        for (i = 0; i < nNodes; i++) {
            node[i].i.loser = &node[i].e;
            node[i].i.parent = &node[i/2].i;
            node[i].e.parent = &node[(nNodes + i)/2].i;
            node[i].e.run = 0;
            node[i].e.valid = false;
        }
        win = &node[0].e;
        lastKeyValid = false;

        if ((ifp = fopen(ifName, "rb")) == NULL) {
            printf("error: file %s, unable to open\n", ifName);
            cleanExit(1);
        }
    }

    while (1) {

        /* replace previous winner with new record */
        if (!eof) {
            if (fread(&win->rec, sizeof(recType), 1, ifp) == 1) {
                if ((!lastKeyValid || compLT(win->rec.key, lastKey))
                && (++win->run > maxRun))
                    maxRun = win->run;
                win->valid = true;
            } else if (feof(ifp)) {
                fclose(ifp);
                eof = true;
                win->valid = false;
                win->run = maxRun + 1;
            } else {
                perror("io4");
                cleanExit(1);
            } 
        } else {
            win->valid = false;
            win->run = maxRun + 1;
        }

        /* adjust loser and winner pointers */
        p = win->parent;
        do {
            bool swap;
            swap = false;
            if (p->loser->run < win->run) {
                swap = true;
            } else if (p->loser->run == win->run) {
                if (p->loser->valid && win->valid) {
                    if (compLT(p->loser->rec.key, win->rec.key))
                        swap = true;
                } else {
                    swap = true;
                }
            }
            if (swap) {
                /* p should be winner */
                eNodeType *t;

                t = p->loser;
                p->loser = win;
                win = t;
            }
            p = p->parent;
        } while (p != &node[0].i);

        /* end of run? */
        if (win->run != curRun) {
            /* win->run = curRun + 1 */
            if (win->run > maxRun) {
                /* end of output */
                free(node);
                return NULL;
            }
            curRun = win->run;
        }

        /* output top of tree */
        if (win->run) {
            lastKey = win->rec.key;
            lastKeyValid = true;
            return &win->rec;
        }
    }
}

void makeRuns(void) {
    recType *win;       /* winner */
    int fileT;          /* last file */
    int fileP;          /* next to last file */
    int j;              /* selects file[j] */


    /* Make initial runs using replacement selection.
     * Runs are written using a Fibonacci distintbution.
     */

    /* initialize file structures */
    fileT = nTmpFiles - 1;
    fileP = fileT - 1;
    for (j = 0; j < fileT; j++) {
        file[j]->fib = 1;
        file[j]->dummy = 1;
    }
    file[fileT]->fib = 0;
    file[fileT]->dummy = 0;

    level = 1;
    j = 0;


    win = readRec();
    while (win) {
        bool anyrun;

        anyrun = false;
        for (j = 0; win && j <= fileP; j++) {
            bool run;

            run = false;
            if (file[j]->valid) {
                if (!compLT(win->key, file[j]->rec.key)) {
                    /* append to an existing run */
                    run = true;
                } else if (file[j]->dummy) {
                    /* start a new run */
                    file[j]->dummy--;
                    run = true;
                }
            } else {
                /* first run in file */
                file[j]->dummy--;
                run = true;
            }

            if (run) {
                anyrun = true;

                /* flush run */
                while(1) {
                    if (fwrite(win, sizeof(recType), 1, file[j]->fp) != 1) {
                        perror("io3");
                        cleanExit(1);
                    }
                    file[j]->rec.key = win->key;
                    file[j]->valid = true;
                    if ((win = readRec()) == NULL) break;
                    if (compLT(win->key, file[j]->rec.key)) break;
                }
            }
        }

        /* if no room for runs, up a level */
        if (!anyrun) {
            int t;
            level++;
            t = file[0]->fib;
            for (j = 0; j <= fileP; j++) {
                file[j]->dummy = t + file[j+1]->fib - file[j]->fib;
                file[j]->fib = t + file[j+1]->fib; 
            }
        }
    }
}

void rewindFile(int j) {
    /* rewind file[j] and read in first record */
    file[j]->eor = false;
    file[j]->eof = false;
    rewind(file[j]->fp);
    if (fread(&file[j]->rec, sizeof(recType), 1, file[j]->fp) != 1) {
        if (feof(file[j]->fp)) {
            file[j]->eor = true;
            file[j]->eof = true;
        } else {
            perror("io5");
            cleanExit(1);
        }
    }
}

void mergeSort(void) {
    int fileT;
    int fileP;
    int j;
    tmpFileType *tfile;

    /* polyphase merge sort */

    fileT = nTmpFiles - 1;
    fileP = fileT - 1;

    /* prime the files */
    for (j = 0; j < fileT; j++) {
        rewindFile(j);
    }

    /* each pass through loop merges one run */
    while (level) {
        while(1) {
            bool allDummies;
            bool anyRuns;

            /* scan for runs */
            allDummies = true;
            anyRuns = false;
            for (j = 0; j <= fileP; j++) {
                if (!file[j]->dummy) {
                    allDummies = false;
                    if (!file[j]->eof) anyRuns = true;
                }
            }

            if (anyRuns) {
                int k;
                keyType lastKey;

                /* merge 1 run file[0]..file[P] --> file[T] */

                while(1) {
                    /* each pass thru loop writes 1 record to file[fileT] */

                    /* find smallest key */
                    k = -1;
                    for (j = 0; j <= fileP; j++) {
                        if (file[j]->eor) continue;
                        if (file[j]->dummy) continue;
                        if (k < 0 || 
                        (k != j && compGT(file[k]->rec.key, file[j]->rec.key)))
                            k = j;
                    }
                    if (k < 0) break;

                    /* write record[k] to file[fileT] */
                    if (fwrite(&file[k]->rec, sizeof(recType), 1, 
                            file[fileT]->fp) != 1) {
                        perror("io6");
                        cleanExit(1);
                    }

                    /* replace record[k] */
                    lastKey = file[k]->rec.key;
                    if (fread(&file[k]->rec, sizeof(recType), 1,
                            file[k]->fp) == 1) {
                        /* check for end of run on file[s] */
                        if (compLT(file[k]->rec.key, lastKey))
                            file[k]->eor = true;
                    } else if (feof(file[k]->fp)) {
                        file[k]->eof = true;
                        file[k]->eor = true;
                    } else {
                        perror("io7");
                        cleanExit(1);
                    }
                }

                /* fixup dummies */
                for (j = 0; j <= fileP; j++) {
                    if (file[j]->dummy) file[j]->dummy--;
                    if (!file[j]->eof) file[j]->eor = false;
                }

            } else if (allDummies) {
                for (j = 0; j <= fileP; j++)
                    file[j]->dummy--;
                file[fileT]->dummy++;
            }

            /* end of run */
            if (file[fileP]->eof && !file[fileP]->dummy) {
                /* completed a fibonocci-level */
                level--;
                if (!level) {
                    /* we're done, file[fileT] contains data */
                    return;
                }

                /* fileP is exhausted, reopen as new */
                fclose(file[fileP]->fp);
                if ((file[fileP]->fp = fopen(file[fileP]->name, "w+b"))
                        == NULL) {
                    perror("io8");
                    cleanExit(1);
                }
                file[fileP]->eof = false;
                file[fileP]->eor = false;

                rewindFile(fileT);

                /* f[0],f[1]...,f[fileT] <-- f[fileT],f[0]...,f[T-1] */
                tfile = file[fileT];
                memmove(file + 1, file, fileT * sizeof(tmpFileType*));
                file[0] = tfile;

                /* start new runs */
                for (j = 0; j <= fileP; j++)
                    if (!file[j]->eof) file[j]->eor = false;
            }
        }

    }
}


void extSort(void) {
    initTmpFiles();
    makeRuns();
    mergeSort();
    termTmpFiles(0);
}

int main(int argc, char *argv[]) {

    /* command-line:
     *
     *   ext ifName ofName nTmpFiles nNodes
     *
     *   ext in.dat out.dat 5 2000
     *       reads in.dat, sorts using 5 files and 2000 nodes, output to out.dat
     */
    if (argc != 5) {
        printf("%s ifName ofName nTmpFiles nNodes\n", argv[0]);
        cleanExit(1);
    }

    ifName = argv[1];
    ofName = argv[2];
    nTmpFiles = atoi(argv[3]);
    nNodes = atoi(argv[4]);

    printf("extSort: nFiles=%d, nNodes=%d, lrecl=%d\n",
        nTmpFiles, nNodes, sizeof(recType));

    extSort();

    return 0;
}

Top