/* network_writev.c - writev(2) backend for writing a chunkqueue to a socket */
#include "network_backends.h"
#ifdef USE_WRITEV
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/resource.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <netdb.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <stdio.h>
#include <assert.h>

#include "network.h"
#include "fdevent.h"
#include "log.h"
#include "stat_cache.h"
#if 0
#define LOCAL_BUFFERING 1
#endif
/**
 * write a chunkqueue to a socket using writev(2)
 *
 * Consecutive MEM_CHUNKs are gathered into a single writev() call,
 * bounded by the system iovec limit and by SSIZE_MAX (to keep the
 * writev() return value meaningful).  FILE_CHUNKs are mmap()ed in
 * 512kb windows and pushed out with write(); the mmap window slides
 * forward as the chunk drains.
 *
 * @param srv server context (used for logging)
 * @param con connection (used for the stat-cache lookup)
 * @param fd  socket to write to
 * @param cq  chunkqueue to drain; cq->bytes_out and per-chunk offsets
 *            are advanced by however much was actually written
 * @return number of fully-written chunks; -1 on error,
 *         -2 on EPIPE/ECONNRESET (peer gone)
 */
int network_write_chunkqueue_writev(server *srv, connection *con, int fd, chunkqueue *cq) {
	chunk *c;
	size_t chunks_written = 0;

	for (c = cq->first; c; c = c->next) {
		int chunk_finished = 0;

		switch (c->type) {
		case MEM_CHUNK: {
			char *offset;
			size_t toSend;
			ssize_t r;

			size_t num_chunks, i;
			struct iovec *chunks;
			chunk *tc;
			size_t num_bytes = 0;
#if defined(_SC_IOV_MAX) /* IRIX, MacOS X, FreeBSD, Solaris, ... */
			/* BUGFIX: sysconf() returns -1 when the limit is indeterminate;
			 * the old code assigned that straight into a size_t, producing a
			 * huge bogus limit.  Fall back to _XOPEN_IOV_MAX (16), the
			 * minimum POSIX guarantees. */
			const long iov_max = sysconf(_SC_IOV_MAX);
			const size_t max_chunks = (iov_max > 0) ? (size_t)iov_max : 16;
#elif defined(IOV_MAX) /* Linux x86 (glibc-2.3.6-3) */
			const size_t max_chunks = IOV_MAX;
#elif defined(MAX_IOVEC) /* Linux ia64 (glibc-2.3.3-98.28) */
			const size_t max_chunks = MAX_IOVEC;
#elif defined(UIO_MAXIOV) /* Linux x86 (glibc-2.2.5-233) */
			const size_t max_chunks = UIO_MAXIOV;
#elif (defined(__FreeBSD__) && __FreeBSD_version < 500000) || defined(__DragonFly__) || defined(__APPLE__)
			/* - FreeBSD 4.x
			 * - MacOS X 10.3.x
			 *   (covered in -DKERNEL)
			 * */
			const size_t max_chunks = 1024; /* UIO_MAXIOV value from sys/uio.h */
#else
#error "sysconf() doesnt return _SC_IOV_MAX ..., check the output of 'man writev' for the EINVAL error and send the output to jan@kneschke.de"
#endif

			/* build the writev() list
			 *
			 * 1. limit: num_chunks < max_chunks
			 * 2. limit: num_bytes <= SSIZE_MAX
			 *    (we can't send more than SSIZE_MAX bytes in one writev()) */
			for (num_chunks = 0, tc = c;
			     tc && tc->type == MEM_CHUNK && num_chunks < max_chunks;
			     num_chunks++, tc = tc->next) ;

			chunks = calloc(num_chunks, sizeof(*chunks));
			if (NULL == chunks) {
				/* BUGFIX: the allocation result was not checked before */
				log_error_write(srv, __FILE__, __LINE__, "s", "calloc failed");
				return -1;
			}

			for (tc = c, i = 0; i < num_chunks; tc = tc->next, i++) {
				if (tc->mem->used == 0) {
					/* empty buffer: emit a zero-length iovec so the
					 * bookkeeping loop below still advances past it */
					chunks[i].iov_base = tc->mem->ptr;
					chunks[i].iov_len = 0;
				} else {
					offset = tc->mem->ptr + tc->offset;
					/* mem->used counts the trailing NUL, hence the -1 */
					toSend = tc->mem->used - 1 - tc->offset;

					chunks[i].iov_base = offset;

					/* protect the return value of writev() */
					if (toSend > SSIZE_MAX ||
					    num_bytes + toSend > SSIZE_MAX) {
						chunks[i].iov_len = SSIZE_MAX - num_bytes;
						num_chunks = i + 1;
						break;
					} else {
						chunks[i].iov_len = toSend;
					}

					num_bytes += toSend;
				}
			}

			if ((r = writev(fd, chunks, num_chunks)) < 0) {
				switch (errno) {
				case EAGAIN:
				case EINTR:
					/* nothing written; retry on the next event */
					r = 0;
					break;
				case EPIPE:
				case ECONNRESET:
					free(chunks);
					return -2;
				default:
					log_error_write(srv, __FILE__, __LINE__, "ssd",
							"writev failed:", strerror(errno), fd);
					free(chunks);
					return -1;
				}
			}

			cq->bytes_out += r;

			/* check which chunks have been written */
			for (i = 0, tc = c; i < num_chunks; i++, tc = tc->next) {
				if (r >= (ssize_t)chunks[i].iov_len) {
					/* written */
					r -= chunks[i].iov_len;
					tc->offset += chunks[i].iov_len;

					if (chunk_finished) {
						/* skip the chunks from further touches */
						chunks_written++;
						c = c->next;
					} else {
						/* chunks_written + c = c->next is done in the for() */
						chunk_finished++;
					}
				} else {
					/* partially written */
					tc->offset += r;
					chunk_finished = 0;

					break;
				}
			}
			free(chunks);

			break;
		}
		case FILE_CHUNK: {
			ssize_t r;
			off_t abs_offset;
			off_t toSend;
			stat_cache_entry *sce = NULL;

#define KByte * 1024
#define MByte * 1024 KByte
#define GByte * 1024 MByte
			const off_t we_want_to_mmap = 512 KByte;
			char *start = NULL;

			if (HANDLER_ERROR == stat_cache_get_entry(srv, con, c->file.name, &sce)) {
				log_error_write(srv, __FILE__, __LINE__, "sb",
						strerror(errno), c->file.name);
				return -1;
			}

			abs_offset = c->file.start + c->offset;

			if (abs_offset > sce->st.st_size) {
				log_error_write(srv, __FILE__, __LINE__, "sb",
						"file was shrinked:", c->file.name);
				return -1;
			}

			/* mmap the buffer
			 * - first mmap
			 * - new mmap as we are at the end of the last one */
			if (c->file.mmap.start == MAP_FAILED ||
			    abs_offset == (off_t)(c->file.mmap.offset + c->file.mmap.length)) {

				/* Optimizations for the future:
				 *
				 * adaptive mem-mapping
				 *   the problem:
				 *    we mmap() the whole file. If someone has alot large files and 32bit
				 *    machine the virtual address area will be unrun and we will have a failing
				 *    mmap() call.
				 *   solution:
				 *    only mmap 16M in one chunk and move the window as soon as we have finished
				 *    the first 8M
				 *
				 * read-ahead buffering
				 *   the problem:
				 *    sending out several large files in parallel trashes the read-ahead of the
				 *    kernel leading to long wait-for-seek times.
				 *   solutions: (increasing complexity)
				 *    1. use madvise
				 *    2. use a internal read-ahead buffer in the chunk-structure
				 *    3. use non-blocking IO for file-transfers
				 * */

				/* all mmap()ed areas are 512kb except the last which might be smaller */
				off_t we_want_to_send;
				size_t to_mmap;

				/* this is a remap, move the mmap-offset */
				if (c->file.mmap.start != MAP_FAILED) {
					munmap(c->file.mmap.start, c->file.mmap.length);
					c->file.mmap.offset += we_want_to_mmap;
				} else {
					/* in case the range-offset is after the first mmap()ed area we skip the area */
					c->file.mmap.offset = 0;

					while (c->file.mmap.offset + we_want_to_mmap < c->file.start) {
						c->file.mmap.offset += we_want_to_mmap;
					}
				}

				/* length is rel, c->offset too, assume there is no limit at the mmap-boundaries */
				we_want_to_send = c->file.length - c->offset;
				to_mmap = (c->file.start + c->file.length) - c->file.mmap.offset;

				/* we have more to send than we can mmap() at once */
				if (abs_offset + we_want_to_send > c->file.mmap.offset + we_want_to_mmap) {
					we_want_to_send = (c->file.mmap.offset + we_want_to_mmap) - abs_offset;
					to_mmap = we_want_to_mmap;
				}

				if (-1 == c->file.fd) {  /* open the file if not already open */
					if (-1 == (c->file.fd = open(c->file.name->ptr, O_RDONLY))) {
						log_error_write(srv, __FILE__, __LINE__, "sbs", "open failed for:", c->file.name, strerror(errno));

						return -1;
					}
#ifdef FD_CLOEXEC
					fcntl(c->file.fd, F_SETFD, FD_CLOEXEC);
#endif
				}

				if (MAP_FAILED == (c->file.mmap.start = mmap(0, to_mmap, PROT_READ, MAP_SHARED, c->file.fd, c->file.mmap.offset))) {
					/* close it here, otherwise we'd have to set FD_CLOEXEC */

					log_error_write(srv, __FILE__, __LINE__, "ssbd", "mmap failed:",
							strerror(errno), c->file.name, c->file.fd);

					return -1;
				}

				c->file.mmap.length = to_mmap;
#ifdef LOCAL_BUFFERING
				buffer_copy_string_len(c->mem, c->file.mmap.start, c->file.mmap.length);
#else
#ifdef HAVE_MADVISE
				/* don't advise files < 64Kb */
				if (c->file.mmap.length > (64 KByte)) {
					/* darwin 7 is returning EINVAL all the time and I don't know how to
					 * detect this at runtime.
					 *
					 * ignore the return value for now */
					madvise(c->file.mmap.start, c->file.mmap.length, MADV_WILLNEED);
				}
#endif
#endif

				/* chunk_reset() or chunk_free() will cleanup for us */
			}

			/* to_send = abs_mmap_end - abs_offset */
			toSend = (c->file.mmap.offset + c->file.mmap.length) - (abs_offset);

			if (toSend < 0) {
				/* BUGFIX: the old code did assert(toSend < 0) inside this
				 * branch -- a tautology that can never fire -- and then fell
				 * through to write() with a negative length.  Log and bail
				 * out instead. */
				log_error_write(srv, __FILE__, __LINE__, "soooo",
						"toSend is negative:",
						toSend,
						c->file.mmap.length,
						abs_offset,
						c->file.mmap.offset);
				return -1;
			}

#ifdef LOCAL_BUFFERING
			start = c->mem->ptr;
#else
			start = c->file.mmap.start;
#endif

			if ((r = write(fd, start + (abs_offset - c->file.mmap.offset), toSend)) < 0) {
				switch (errno) {
				case EAGAIN:
				case EINTR:
					/* nothing written; retry on the next event */
					r = 0;
					break;
				case EPIPE:
				case ECONNRESET:
					return -2;
				default:
					log_error_write(srv, __FILE__, __LINE__, "ssd",
							"write failed:", strerror(errno), fd);

					return -1;
				}
			}

			c->offset += r;
			cq->bytes_out += r;

			if (c->offset == c->file.length) {
				chunk_finished = 1;

				/* we don't need the mmaping anymore */
				if (c->file.mmap.start != MAP_FAILED) {
					munmap(c->file.mmap.start, c->file.mmap.length);
					c->file.mmap.start = MAP_FAILED;
				}
			}

			break;
		}
		default:

			log_error_write(srv, __FILE__, __LINE__, "ds", c, "type not known");

			return -1;
		}

		if (!chunk_finished) {
			/* not finished yet */

			break;
		}

		chunks_written++;
	}

	return chunks_written;
}
#endif