You are losing most of your speed by calling a function for every single pixel. Here's how I would write it:
/*
 * Describes a 32-bit-per-pixel drawing surface.
 *
 *   topleft  address of pixel (0,0), stored as an integer
 *            (NOTE: this requires long to be wide enough to hold a pointer)
 *   pitch    distance in bytes from the start of one row to the next
 *   xres     number of addressable pixels per row (exclusive x limit)
 *   yres     number of rows (exclusive y limit)
 */
typedef struct { long topleft, pitch, xres, yres; } frame_t;

/*
 * Fills the rectangle [x0,x1) x [y0,y1) of frame dd with the colour col.
 *
 * The rectangle is clipped against [0,xres) x [0,yres); a rectangle that is
 * empty after clipping (or was given with x0 >= x1 or y0 >= y1) draws nothing.
 * Only the low 32 bits of col are stored.
 */
static void drawrect_32bpp (frame_t *dd, long x0, long y0, long x1, long y1, long col)
{
    /*
     * A pixel is 4 bytes.  Do not use long here: it is 8 bytes on LP64
     * systems, which would double the per-pixel stride and write past the
     * end of every row.  (Assumes a 32-bit unsigned int.)
     */
    unsigned int *lptr;
    unsigned int pixel = (unsigned int)col;
    long x, y;

    /* Clip to the frame; written out because min()/max() are not standard C. */
    if (x0 < 0) { x0 = 0; }
    if (y0 < 0) { y0 = 0; }
    if (x1 > dd->xres) { x1 = dd->xres; }
    if (y1 > dd->yres) { y1 = dd->yres; }

    /* Nothing left to draw: bail out before forming any row address. */
    if (x0 >= x1 || y0 >= y1) { return; }

    /* Start of the first row to fill; pitch is in bytes, so step via char *. */
    lptr = (unsigned int *)((char *)dd->topleft + y0 * dd->pitch);

    for (y = y0; y < y1; y++) {
        for (x = x0; x < x1; x++) {
            lptr[x] = pixel;
        }
        lptr = (unsigned int *)((char *)lptr + dd->pitch);
    }
}
You could use a lookup table for the initial calculation of lptr, but I've found that it isn't always faster.