From 1c8a862f58d7f28e5b8e8c5658478413ab0becbe Mon Sep 17 00:00:00 2001 From: Angie Hinrichs Date: Wed, 28 Sep 2011 16:18:35 -0700 Subject: [PATCH] Modifications for UCSC Genome Browser (#ifdef KNETFILE_HOOKS): 1. knetfile can become a wrapper on another network library, for example the UCSC Genome Browser's network code which also supports https and sparse-file local caching of data. 2. The EOF check is #ifdef'd out because it costs another access, prints to stderr when it can't seek, and I don't consider it necessary. Hmmm, I should have added an option for it. --- Makefile | 2 +- bam.c | 2 + bam_index.c | 102 ++++++++++++++++++++++++++++++++++++++++++------ bcftools/main.c | 3 +- knetfile.c | 89 +++++++++++++++++++++++++++++++++++++++++- knetfile.h | 55 +++++++++----------------- 6 files changed, 202 insertions(+), 51 deletions(-) diff --git a/Makefile b/Makefile index 99f7eca..1937b85 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ CC= gcc CFLAGS= -g -Wall -O2 #-m64 #-arch ppc -DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1 +DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1 -DKNETFILE_HOOKS KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \ diff --git a/bam.c b/bam.c index 0055e84..69d5dd8 100644 --- a/bam.c +++ b/bam.c @@ -72,6 +72,7 @@ bam_header_t *bam_header_read(bamFile fp) char buf[4]; int magic_len; int32_t i = 1, name_len; +#if 0 // check EOF i = bgzf_check_EOF(fp); if (i < 0) { @@ -80,6 +81,7 @@ bam_header_t *bam_header_read(bamFile fp) if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF"); } else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n"); +#endif // read "BAM1" magic_len = bam_read(fp, buf, 4); if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { diff --git a/bam_index.c b/bam_index.c index 9610a26..246f140 100644 --- a/bam_index.c +++ b/bam_index.c @@ -318,7 +318,55 @@ void bam_index_save(const bam_index_t *idx, FILE *fp) fflush(fp); } -static bam_index_t *bam_index_load_core(FILE *fp) +typedef size_t (*index_read_f)(void *ptr, size_t size, size_t nmemb, void *fp); + +#ifdef KNETFILE_HOOKS +// Use buffered knetfile I/O instead of saving index file to local directory. +#define KNETBUFSIZE (1024 * 1024) +struct knet_buf { + knetFile *fp; // knetFile (may belong to knet_alt_* hooks) + int offset; // offset of first buffered byte that has not been read + int len; // number of unread buffered bytes + unsigned char eof; // set to 1 when we knet_read fewer bytes than expected + unsigned char buf[KNETBUFSIZE]; +}; + +struct knet_buf *knet_buf_new(knetFile *fp) +{ + struct knet_buf *kb = (struct knet_buf *)malloc(sizeof(struct knet_buf)); + kb->fp = fp; + kb->offset = kb->len = kb->eof = 0; + memset(&(kb->buf[0]), 0, KNETBUFSIZE); + return kb; +} + +size_t index_knet_read(void *ptr, size_t size, size_t nmemb, void *fp) +{ + struct knet_buf *kb = fp; + size_t remaining = (size * nmemb); + while (remaining > 0) { + if (kb->len > 0) { + size_t count = (kb->len < remaining) ? kb->len : remaining; + memcpy(ptr, kb->buf+kb->offset, count); + ptr += count; + kb->offset += count; + kb->len -= count; + remaining -= count; + } + if (kb->eof) + break; + if (remaining > 0) { + kb->len = knet_read(kb->fp, kb->buf, KNETBUFSIZE); + kb->offset = 0; + if (kb->len < KNETBUFSIZE) + kb->eof = 1; + } + } + return ((size * nmemb) - remaining) / size; +} +#endif + +static bam_index_t *bam_index_load_core(void *fp, index_read_f index_read) { int i; char magic[4]; @@ -327,14 +375,13 @@ static bam_index_t *bam_index_load_core(FILE *fp) fprintf(stderr, "[bam_index_load_core] fail to load index.\n"); return 0; } - fread(magic, 1, 4, fp); + index_read(magic, 1, 4, fp); if (strncmp(magic, "BAI\1", 4)) { fprintf(stderr, "[bam_index_load] wrong magic number.\n"); - fclose(fp); return 0; } idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); - fread(&idx->n, 4, 1, fp); + index_read(&idx->n, 4, 1, fp); if (bam_is_be) bam_swap_endian_4p(&idx->n); idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); @@ -347,18 +394,18 @@ static bam_index_t *bam_index_load_core(FILE *fp) bam_binlist_t *p; index = idx->index[i] = kh_init(i); // load binning index - fread(&size, 4, 1, fp); + index_read(&size, 4, 1, fp); if (bam_is_be) bam_swap_endian_4p(&size); for (j = 0; j < (int)size; ++j) { - fread(&key, 4, 1, fp); + index_read(&key, 4, 1, fp); if (bam_is_be) bam_swap_endian_4p(&key); k = kh_put(i, index, key, &ret); p = &kh_value(index, k); - fread(&p->n, 4, 1, fp); + index_read(&p->n, 4, 1, fp); if (bam_is_be) bam_swap_endian_4p(&p->n); p->m = p->n; p->list = (pair64_t*)malloc(p->m * 16); - fread(p->list, 16, p->n, fp); + index_read(p->list, 16, p->n, fp); if (bam_is_be) { int x; for (x = 0; x < p->n; ++x) { @@ -368,15 +415,15 @@ static bam_index_t *bam_index_load_core(FILE *fp) } } // load linear index - fread(&index2->n, 4, 1, fp); + index_read(&index2->n, 4, 1, fp); if (bam_is_be) bam_swap_endian_4p(&index2->n); index2->m = index2->n; index2->offset = (uint64_t*)calloc(index2->m, 8); - fread(index2->offset, index2->n, 8, fp); + index_read(index2->offset, 8, index2->n, fp); if (bam_is_be) for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); } - if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0; + if (index_read(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0; if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor); return idx; } @@ -406,12 +453,13 @@ bam_index_t *bam_index_load_local(const char *_fn) } free(fnidx); free(fn); if (fp) { - bam_index_t *idx = bam_index_load_core(fp); + bam_index_t *idx = bam_index_load_core(fp, (index_read_f)fread); fclose(fp); return idx; } else return 0; } +#ifndef KNETFILE_HOOKS #ifdef _USE_KNETFILE static void download_from_remote(const char *url) { @@ -449,10 +497,39 @@ static void download_from_remote(const char *url) return; } #endif +#endif bam_index_t *bam_index_load(const char *fn) { bam_index_t *idx; +#if (defined _USE_KNETFILE && defined KNETFILE_HOOKS) + if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn || + strstr(fn, "https://") == fn) { + knetFile *fp; + struct knet_buf *kb; + size_t len = strlen(fn); + char *fnidx = (char*)calloc(len + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + fp = knet_open(fnidx, "r"); + if (fp == NULL && !strcmp(fn+len-4, ".bam")) { + char *fnidx2 = (char*)calloc(len, 1); + strcpy(fnidx2, fn); + strncpy(fnidx2+len-4, ".bai", 5); + fp = knet_open(fnidx2, "r"); + if (fp == NULL) { + fprintf(stderr, "Unable to open index file for %s. Tried %s and %s.", + fn, fnidx, fnidx2); + return NULL; + } + } + kb = knet_buf_new(fp); + idx = bam_index_load_core(kb, index_knet_read); + knet_close(fp); + free(kb); + } else { + idx = bam_index_load_local(fn); + } +#else idx = bam_index_load_local(fn); if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) { char *fnidx = calloc(strlen(fn) + 5, 1); @@ -461,6 +538,7 @@ bam_index_t *bam_index_load(const char *fn) download_from_remote(fnidx); idx = bam_index_load_local(fn); } +#endif if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n"); return idx; } diff --git a/bcftools/main.c b/bcftools/main.c index fcd94b8..de743ca 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -28,7 +28,7 @@ int bcf_cat(int n, char * const *fn) h = bcf_hdr_read(in); if (i == 0) bcf_hdr_write(out, h); bcf_hdr_destroy(h); -#ifdef _USE_KNETFILE +#if (defined _USE_KNETFILE && !defined KNETFILE_HOOKS) fstat(knet_fileno(in->fp->x.fpr), &s); end = s.st_size - 28; while (knet_tell(in->fp->x.fpr) < end) { @@ -37,6 +37,7 @@ int bcf_cat(int n, char * const *fn) fwrite(buf, 1, size, out->fp->x.fpw); } #else + fprintf(stderr, "Sorry, bcftools cat is not implemented unless compiled with _USE_KNETFILE without KNETFILE_HOOKS\n"); abort(); // FIXME: not implemented #endif bcf_close(in); diff --git a/knetfile.c b/knetfile.c index af09146..54fedd7 100644 --- a/knetfile.c +++ b/knetfile.c @@ -44,6 +44,63 @@ #endif #include "knetfile.h" +#include +#include + +#ifndef _WIN32 +#define netread(fd, ptr, len) read(fd, ptr, len) +#define netwrite(fd, ptr, len) write(fd, ptr, len) +#define netclose(fd) close(fd) +#else +#include +#define netread(fd, ptr, len) recv(fd, ptr, len, 0) +#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) +#define netclose(fd) closesocket(fd) +#endif + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +#ifdef KNETFILE_HOOKS +// Static global function pointers that may be set by knet_init_alt() +// to replace the usual knet functionality with alternate I/O implementation. +static knet_alt_open_f alt_open = NULL; +static knet_alt_dopen_f alt_dopen = NULL; +static knet_alt_read_f alt_read = NULL; +static knet_alt_seek_f alt_seek = NULL; +static knet_alt_tell_f alt_tell = NULL; +static knet_alt_close_f alt_close = NULL; + +void knet_init_alt(knet_alt_open_f open, knet_alt_dopen_f dopen, knet_alt_read_f read, + knet_alt_seek_f seek, knet_alt_tell_f tell, knet_alt_close_f close) +{ + alt_open = open; + alt_dopen = dopen; + alt_read = read; + alt_seek = seek; + alt_tell = tell; + alt_close = close; +} +#endif + +struct knetFile_s { + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; +}; // typedef'd to knetFile in knetfile.h + /* In winsock.h, the type of a socket is SOCKET, which is: "typedef * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed @@ -85,6 +142,7 @@ static int socket_wait(int fd, int is_read) * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ static int socket_connect(const char *host, const char *port) { +#define __err_connect_no_res(func) do { perror(func); return -1; } while (0) #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) int on = 1, fd; @@ -95,7 +153,7 @@ static int socket_connect(const char *host, const char *port) hints.ai_socktype = SOCK_STREAM; /* In Unix/Mac, getaddrinfo() is the most convenient way to get * server information. */ - if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect_no_res("getaddrinfo"); if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); /* The following two setsockopt() are used by ftplib * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they @@ -450,6 +508,10 @@ int khttp_connect_file(knetFile *fp) knetFile *knet_open(const char *fn, const char *mode) { +#ifdef KNETFILE_HOOKS + if (alt_open) + return alt_open(fn, mode); +#endif knetFile *fp = 0; if (mode[0] != 'r') { fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); @@ -494,6 +556,10 @@ knetFile *knet_open(const char *fn, const char *mode) knetFile *knet_dopen(int fd, const char *mode) { +#ifdef KNETFILE_HOOKS + if (alt_dopen) + return alt_dopen(fd, mode); +#endif knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->type = KNF_TYPE_LOCAL; fp->fd = fd; @@ -502,6 +568,10 @@ knetFile *knet_dopen(int fd, const char *mode) off_t knet_read(knetFile *fp, void *buf, off_t len) { +#ifdef KNETFILE_HOOKS + if (alt_read) + return alt_read(fp, buf, len); +#endif off_t l = 0; if (fp->fd == -1) return 0; if (fp->type == KNF_TYPE_FTP) { @@ -530,6 +600,10 @@ off_t knet_read(knetFile *fp, void *buf, off_t len) off_t knet_seek(knetFile *fp, int64_t off, int whence) { +#ifdef KNETFILE_HOOKS + if (alt_seek) + return alt_seek(fp, off, whence); +#endif if (whence == SEEK_SET && off == fp->offset) return 0; if (fp->type == KNF_TYPE_LOCAL) { /* Be aware that lseek() returns the offset after seeking, @@ -573,8 +647,21 @@ off_t knet_seek(knetFile *fp, int64_t off, int whence) return -1; } +off_t knet_tell(knetFile *fp) +{ +#ifdef KNETFILE_HOOKS + if (alt_tell) + return alt_tell(fp); +#endif + return fp->offset; +} + int knet_close(knetFile *fp) { +#ifdef KNETFILE_HOOKS + if (alt_close) + return alt_close(fp); +#endif if (fp == 0) return 0; if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific if (fp->fd != -1) { diff --git a/knetfile.h b/knetfile.h index 0a0e66f..b822f60 100644 --- a/knetfile.h +++ b/knetfile.h @@ -1,43 +1,20 @@ #ifndef KNETFILE_H #define KNETFILE_H -#include -#include - -#ifndef _WIN32 -#define netread(fd, ptr, len) read(fd, ptr, len) -#define netwrite(fd, ptr, len) write(fd, ptr, len) -#define netclose(fd) close(fd) -#else -#include -#define netread(fd, ptr, len) recv(fd, ptr, len, 0) -#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) -#define netclose(fd) closesocket(fd) -#endif - -// FIXME: currently I/O is unbuffered - -#define KNF_TYPE_LOCAL 1 -#define KNF_TYPE_FTP 2 -#define KNF_TYPE_HTTP 3 - -typedef struct knetFile_s { - int type, fd; - int64_t offset; - char *host, *port; - - // the following are for FTP only - int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; - char *response, *retr, *size_cmd; - int64_t seek_offset; // for lazy seek - int64_t file_size; - - // the following are for HTTP only - char *path, *http_host; -} knetFile; - -#define knet_tell(fp) ((fp)->offset) +typedef struct knetFile_s knetFile; + +#ifdef KNETFILE_HOOKS +// the following allow knetfile to wrap an alternate I/O library +typedef knetFile *(*knet_alt_open_f)(const char *fn, const char *mode); +typedef knetFile *(*knet_alt_dopen_f)(int fd, const char *mode); +typedef off_t (*knet_alt_read_f)(knetFile *fp, void *buf, off_t len); +typedef off_t (*knet_alt_seek_f)(knetFile *fp, int64_t off, int whence); +typedef off_t (*knet_alt_tell_f)(knetFile *fp); +typedef int (*knet_alt_close_f)(knetFile *fp); +#else +// As of 2/18/2010 this is not used anywhere in samtools, and would not play well with abstraction: #define knet_fileno(fp) ((fp)->fd) +#endif #ifdef __cplusplus extern "C" { @@ -50,6 +27,11 @@ extern "C" { knetFile *knet_open(const char *fn, const char *mode); +#ifdef KNETFILE_HOOKS + void knet_init_alt(knet_alt_open_f open, knet_alt_dopen_f dopen, knet_alt_read_f read, + knet_alt_seek_f seek, knet_alt_tell_f tell, knet_alt_close_f close); +#endif + /* This only works with local files. */ @@ -66,6 +48,7 @@ extern "C" { communicate with the FTP server. */ off_t knet_seek(knetFile *fp, int64_t off, int whence); + off_t knet_tell(knetFile *fp); int knet_close(knetFile *fp); #ifdef __cplusplus