Problem
I wrote a quick-select filter in C on Wednesday. (The code is below.)
It is a filter in the UNIX tradition:
-
It reads from standard input
k
, the rank of the integer to select,n
, the number of elements, and thenn
integers. -
It outputs the
k
th highest integer.
(A perhaps better design would take k
as an argument—but a bigger gripe of mine is having to give n
! Bonus points if an answer can show me how to avoid reading n
from stdin or as an argument. I would read until EOF, but then I would have to deal with a growing array, I think.)
For the unfamiliar, quickselect can find the k
th highest integer in O(n)
time. The naïve selection algorithm sorts in place and then accesses element k
, for O(n lg n)
runtime. Quickselect does not sort large lists and achieves linear performance.
kth_largest.c
/*d. ben knoble 2019.10.02*/
/*
* PROGRAM
*
* k_largest
*
* Finds the kth largest element of an array in O(n) time.
*
* Reads from standard input
*
* - k: which element to find
* - n: number of elements
* - n elements
*
* Outputs kth largest element to standard out.
*
* IMPLEMENTATION
*
* see kth_largest function
*/
#include <assert.h>
#include <err.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
int compare_longs(const void *a, const void *b) {
long x = *(long *)a;
long y = *(long *)b;
if (x < y)
return -1;
else if (x > y)
return 1;
return 0;
}
void sort(long *a, size_t n) {
qsort(a, n, sizeof(long), compare_longs);
}
size_t med_index(size_t i) {
return (size_t)(floor(i/2));
}
void swap(long *a, long *b) {
long t = *a;
*a = *b;
*b = t;
}
size_t partition(long pivot, long *array, size_t n_elts) {
// find the pivot
size_t pos = -1;
for (size_t i = 0; i < n_elts; ++i) {
if (array[i] == pivot) {
pos = i;
break;
}
}
assert(pos >= 0);
swap(&array[pos], &array[n_elts-1]);
// now pivot is at the end of the array
size_t i = 0;
for (size_t j = 0; j < n_elts; ++j) {
if (array[j] < pivot) {
swap(&array[j], &array[i]);
++i;
}
}
swap(&array[i], &array[n_elts-1]);
return i;
}
long kth_largest(size_t k, long *array, size_t n_elts) {
size_t n_sublists = n_elts/5;
long *medians = malloc(n_sublists * sizeof(long));
if (medians == NULL) { err(1, NULL); }
// sort sublists of 5 elements
for (size_t i = 0; i < n_sublists; ++i) {
size_t start = i*5;
size_t left = n_elts - start < 5 ? n_elts - start : 5;
sort(&array[start], left);
if (left == 5)
medians[i] = array[start + 3];
else
medians[i] = array[start + med_index(left)];
}
// determine pivot (possibly recursively)
long pivot;
if (n_sublists < 5)
pivot = medians[med_index(n_sublists)];
else
pivot = kth_largest(med_index(n_sublists), medians, n_sublists);
// partition
size_t rank = partition(pivot, array, n_elts);
if (k < rank)
return kth_largest(k, array, rank-1);
else if (k > rank)
return kth_largest(rank - k, &array[rank+1], n_elts-(rank+1));
// else k == rank
return pivot;
}
int main(int argc, char **argv) {
size_t k;
size_t n_elts;
scanf("%zd", &k);
scanf("%zd", &n_elts);
long *array = malloc(n_elts * sizeof(long));
if (array == NULL) { err(1, NULL); }
for (size_t i = 0; i < n_elts; ++i)
scanf("%ld", &array[i]);
printf("%ldn", kth_largest(k, array, n_elts));
}
Solution
Major stuff
Consider const
Design: kth_largest()
has a side effect of re-arranging array
. this is surprising and not part of “Finds the kth largest element of an array in O(n) time.” I’d expect code to do the job without the side effect
// long kth_largest(size_t k, long *array, size_t n_elts) {
long kth_largest(size_t k, const long *array, size_t n_elts) {
Small size error
Code may call err(1, NULL);
when n_elts < 5
as malloc(0)
can return NULL
. I’d expect code to work for sizes 1 to 4 also.
Doubtful O() claim
“Finds the kth largest element of an array in O(n) time.” Perhaps when n
>>> k
, but not in general. I’d expect O(n*k)
. So perhaps
Finds the kth largest element of an array in O(n) time when k <<< n
otherwise O(n*k)
No error check
Do not trust user input follows the rules nor allocations always succeed.
long *array = malloc(n_elts * sizeof(long));
if (array == NULL) {
Handle_Error();
}
for (size_t i = 0; i < n_elts; ++i) {
// scanf("%ld", &array[i]);
if (scanf("%ld", &array[i]) != 1) {
Handle_Error();
}
}
Minor stuff
object vs type
Rather than size to the type, size ot the object. Easier to code right, review and maintain.
// qsort(a, n, sizeof(long), compare_longs);
qsort(a, n, sizeof *a, compare_longs);
// long *medians = malloc(n_sublists * sizeof(long));
long *medians = malloc(sizeof *medians * n_sublists);
No need for floating point math
i/2
will already have “floor” the quotient before the floor()
. Absolutely no need for floor()
and with its potential loss if precision for large i
.
// return (size_t)(floor(i/2));
return i/2;
size_t
is an unsigned type
Enable all warnings
pos >= 0
is always true below. A well enabled compilers would have warned.
size_t pos = -1;
...
assert(pos >= 0);
Match specifier and type
"%zd"
does not match a size_t
. "%zu"
does. Also implies useful warnings were not enabled. Save time – enable warnings. See also How to use “zd” specifier with printf()
?/
// scanf("%zd", &k);
// scanf("%zd", &n_elts);
scanf("%zu", &k);
scanf("%zu", &n_elts);
Read until EOF
“I would read until EOF, but then I would have to deal with a growing array, I think.” –> Yes, good idea. Confident this is up to D. Ben Knoble abilities, so will not post code.
Design idea: pass the k
as an argv[]
and the array via stdin
.