Go to:
Gentoo Home
Documentation
Forums
Lists
Bugs
Planet
Store
Wiki
Get Gentoo!
Gentoo's Bugzilla – Attachment 198458 Details for
Bug 278307
=sci-biology/wise-2.2.0 fails to build with glibc 2.10+
Home
|
New
–
[Ex]
|
Browse
|
Search
|
Privacy Policy
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
sqio.c source file with altered getline() function name
sqio.c (text/plain), 44.93 KB, created by
Robert Bradbury
on 2009-07-19 10:32:07 UTC
(
hide
)
Description:
sqio.c source file with altered getline() function name
Filename:
MIME Type:
Creator:
Robert Bradbury
Created:
2009-07-19 10:32:07 UTC
Size:
44.93 KB
patch
obsolete
>/* SQUID - A C function library for biological sequence analysis > * Copyright (C) 1992-1996 Sean R. Eddy > * > * This source code is distributed under terms of the > * GNU General Public License. See the files COPYING > * and GNULICENSE for further details. > * > */ > >/* File: sqio.c > * From: ureadseq.c in Don Gilbert's sequence i/o package > * > * Reads and writes nucleic/protein sequence in various > * formats. Data files may have multiple sequences. > * > * Heavily modified from READSEQ package > * Copyright (C) 1990 by D.G. Gilbert > * Biology Dept., Indiana University, Bloomington, IN 47405 > * email: gilbertd@bio.indiana.edu > * Thanks Don! > * > * SRE: Modifications as noted. Fri Jul 3 09:44:54 1992 > * Packaged for squid, Thu Oct 1 10:07:11 1992 > * ANSI conversion in full swing, Mon Jul 12 12:22:21 1993 > */ > >#include <stdio.h> >#include <stdlib.h> >#include <string.h> >#include <ctype.h> > >#ifndef SEEK_SET >#include <unistd.h> /* may SunOS rot in hell */ >#endif > >#include "squid.h" > >#ifdef MEMDEBUG >#include "dbmalloc.h" >#endif > >#define kStartLength 500 > >static char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*"; >static char *primenuc = "ACGTUN"; >static char *protonly = "EFIPQZ"; >/* static char stdsymbols[6] = "_.-*?"; */ >static char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'|`~\"\\"; >static char *seqsymbols = allsymbols; >/* > use general form of isseqchar -- all chars + symbols. > no formats except nbrf (?) use symbols in data area as > anything other than sequence chars. > (wrong. PIR-CODATA does. Remove /) >*/ > > >void >FreeSequence(char *seq, SQINFO *sqinfo) >{ > if (seq != NULL) free(seq); > if (sqinfo->flags & SQINFO_SS) free(sqinfo->ss); > if (sqinfo->flags & SQINFO_SA) free(sqinfo->sa); >} > >int >SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag) >{ > int len; > int pos; > > while (*sptr == ' ') sptr++; /* ignore leading whitespace */ > for (pos = strlen(sptr)-1; pos >= 0; pos--) > if (! isspace(sptr[pos])) break; > sptr[pos+1] = '\0'; /* ignore trailing whitespace */ > > switch (flag) { > case SQINFO_NAME: > if (*sptr != '-') > { > strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1); > sqinfo->name[SQINFO_NAMELEN-1] = '\0'; > sqinfo->flags |= SQINFO_NAME; > } > break; > > case SQINFO_ID: > if (*sptr != '-') > { > strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1); > sqinfo->id[SQINFO_NAMELEN-1] = '\0'; > sqinfo->flags |= SQINFO_ID; > } > break; > > case SQINFO_ACC: > if (*sptr != '-') > { > strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1); > sqinfo->acc[SQINFO_NAMELEN-1] = '\0'; > sqinfo->flags |= SQINFO_ACC; > } > break; > > case SQINFO_DESC: > if (*sptr != '-') > { > if (sqinfo->flags & SQINFO_DESC) /* append? */ > { > len = strlen(sqinfo->desc); > if (len < SQINFO_DESCLEN-2) /* is there room? */ > { > strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++; > strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len); > } > } > else /* else copy */ > strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1); > sqinfo->desc[SQINFO_DESCLEN-1] = '\0'; > sqinfo->flags |= SQINFO_DESC; > } > break; > > case SQINFO_START: > if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } > sqinfo->start = atoi(sptr); > if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START; > break; > > case SQINFO_STOP: > if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } > sqinfo->stop = atoi(sptr); > if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP; > break; > > case SQINFO_OLEN: > if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } > sqinfo->olen = atoi(sptr); > if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN; > break; > > default: > Die("Invalid flag %d to SetSeqinfoString()", flag); > } > return 1; >} > >void >SeqinfoCopy(SQINFO *sq1, SQINFO *sq2) >{ > sq1->flags = sq2->flags; > if (sq2->flags & SQINFO_NAME) strcpy(sq1->name, sq2->name); > if (sq2->flags & SQINFO_ID) strcpy(sq1->id, sq2->id); > if (sq2->flags & SQINFO_ACC) strcpy(sq1->acc, sq2->acc); > if (sq2->flags & SQINFO_DESC) strcpy(sq1->desc, sq2->desc); > if (sq2->flags & SQINFO_LEN) sq1->len = sq2->len; > if (sq2->flags & SQINFO_START) sq1->start = sq2->start; > if (sq2->flags & SQINFO_STOP) sq1->stop = sq2->stop; > if (sq2->flags & SQINFO_OLEN) sq1->olen = sq2->olen; > if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type; > if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss); > if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa); >} > >/* Function: ToDNA() > * > * Purpose: Convert a sequence to DNA. > * U --> T > */ >void >ToDNA(char *seq) >{ > for (; *seq != '\0'; seq++) > { > if (*seq == 'U') *seq = 'T'; > else if (*seq == 'u') *seq = 't'; > } >} > >/* Function: ToRNA() > * > * Purpose: Convert a sequence to RNA. > * T --> U > */ >void >ToRNA(char *seq) >{ > for (; *seq != '\0'; seq++) > { > if (*seq == 'T') *seq = 'U'; > else if (*seq == 't') *seq = 'u'; > } >} > > >static int >isSeqChar(int c) >{ > if (c > 127) return 0; /* IRIX 4.0 bug! isascii(255) returns TRUE */ > return (isalpha(c) || strchr(seqsymbols,c)); >} > >static void >readline(FILE *f, char *s) >{ > char *cp; > > if (NULL == fgets(s, LINEBUFLEN, f)) > *s = 0; > else { > cp = strchr(s, '\n'); > if (cp != NULL) *cp = 0; > } >} > >/* Function: getsqline() > * Date: SRE, Tue Mar 3 08:30:01 1998 [St. Louis] > * > * Purpose: read a line from a sequence file into V->sbuffer. > * If the fgets() is NULL, V->sbuffer is NULL. > * Trailing \n is chopped. > * If a trailing \n is not there, raise the > * lastlinelong flag in V; either we're at EOF or > * we have a very long line, over our fgets() buffer > * length. > * > * Args: V > * > * Returns: (void) > */ >static void >getsqline(struct ReadSeqVars *V) >{ > char *cp; > > if (fgets(V->sbuffer, LINEBUFLEN, V->f) == NULL) > *(V->sbuffer) = '\0'; > else { > cp = strchr(V->sbuffer, '\n'); > if (cp != NULL) { *cp = '\0'; V->longline = FALSE; } > else V->longline = TRUE; > } >} > > >/* Function: addseq() > * > * Purpose: Add a line of sequence to the growing string in V. > */ >static void >addseq(char *s, struct ReadSeqVars *V) >{ > while (*s != 0) { > if (isSeqChar((int) *s)) { > if (*s == '-' && V->dash_equals_n) *s = 'N'; > if (V->seqlen >= V->maxseq) { > V->maxseq += kStartLength; > V->seq = (char*) ReallocOrDie (V->seq, V->maxseq+1); > } > V->seq[(V->seqlen)++] = *s; > } > s++; > } >} > >static void >addstruc(char *s, struct ReadSeqVars *V) >{ > char *sptr; > > if (! (V->sqinfo->flags & SQINFO_SS)) > { > V->sqinfo->ss = (char *) MallocOrDie ((V->maxseq+1) * sizeof(char)); > V->sqinfo->flags |= SQINFO_SS; > sptr = V->sqinfo->ss; > } > else > { > V->sqinfo->ss = (char *) ReallocOrDie (V->sqinfo->ss, (V->maxseq+1) * sizeof(char)); > sptr = V->sqinfo->ss; > while (*sptr != '\0') sptr++; > } > > while (*s != 0) > { > if (isSeqChar((int)*s)) { *sptr = *s; sptr++; } > s++; > } > *sptr = '\0'; >} > > >static void >readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V) >{ > int addend = 0; > int done = 0; > > V->seqlen = 0; > if (addfirst) addseq(V->sbuffer, V); > do { > getsqline(V); > /* feof() alone is a bug; files not necessarily \n terminated */ > if (*(V->sbuffer) == '\0' && feof(V->f)) > done = TRUE; > done |= (*endTest)(V->sbuffer, &addend); > if (addend || !done) > addseq(V->sbuffer, V); > } while (!done); >} > > >static int >endPIR(char *s, int *addend) >{ > *addend = 0; > if ((strncmp(s, "///", 3) == 0) || > (strncmp(s, "ENTRY", 5) == 0)) > return 1; > else > return 0; >} > >static void >readPIR(struct ReadSeqVars *V) >{ > char *sptr; > /* load first line of entry */ > while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0) > getsqline(V); > if (feof(V->f)) return; > > if ((sptr = strtok(V->sbuffer + 15, "\n\t ")) != NULL) > { > SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); > SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); > } > do { > getsqline(V); > if (!feof(V->f) && strncmp(V->sbuffer, "TITLE", 5) == 0) > SetSeqinfoString(V->sqinfo, V->sbuffer+15, SQINFO_DESC); > else if (!feof(V->f) && strncmp(V->sbuffer, "ACCESSION", 9) == 0) > { > if ((sptr = strtok(V->sbuffer+15, " \t\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); > } > } while (! feof(V->f) && (strncmp(V->sbuffer,"SEQUENCE", 8) != 0)); > getsqline(V); /* skip next line, coords */ > > readLoop(0, endPIR, V); > > /* reading a real PIR-CODATA database file, we keep the source coords > */ > V->sqinfo->start = 1; > V->sqinfo->stop = V->seqlen; > V->sqinfo->olen = V->seqlen; > V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; > > /* get next line > */ > while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0) > getsqline(V); >} > > > >static int >endIG(char *s, int *addend) >{ > *addend = 1; /* 1 or 2 occur in line w/ bases */ > return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL)); >} > >static void >readIG(struct ReadSeqVars *V) >{ > char *nm; > /* position past ';' comments */ > do { > getsqline(V); > } while (! (feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer != ';')) )); > > if (!feof(V->f)) > { > if ((nm = strtok(V->sbuffer, "\n\t ")) != NULL) > SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); > > readLoop(0, endIG, V); > } > > while (!(feof(V->f) || ((*V->sbuffer != '\0') && (*V->sbuffer == ';')))) > getsqline(V); >} > >static int >endStrider(char *s, int *addend) >{ > *addend = 0; > return (strstr( s, "//") != NULL); >} > >static void >readStrider(struct ReadSeqVars *V) >{ > char *nm; > > while ((!feof(V->f)) && (*V->sbuffer == ';')) > { > if (strncmp(V->sbuffer,"; DNA sequence", 14) == 0) > { > if ((nm = strtok(V->sbuffer+16, ",\n\t ")) != NULL) > SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); > } > getsqline(V); > } > > if (! feof(V->f)) > readLoop(1, endStrider, V); > > /* load next line > */ > while ((!feof(V->f)) && (*V->sbuffer != ';')) > getsqline(V); >} > > >static int >endGB(char *s, int *addend) >{ > *addend = 0; > return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s)); >} > >static void >readGenBank(struct ReadSeqVars *V) >{ > char *sptr; > int in_definition; > > while (strncmp(V->sbuffer, "LOCUS", 5) != 0) > getsqline(V); > > if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL) > { > SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); > SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); > } > > in_definition = FALSE; > while (! feof(V->f)) > { > getsqline(V); > if (! feof(V->f) && strstr(V->sbuffer, "DEFINITION") == V->sbuffer) > { > if ((sptr = strtok(V->sbuffer+12, "\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); > in_definition = TRUE; > } > else if (! feof(V->f) && strstr(V->sbuffer, "ACCESSION") == V->sbuffer) > { > if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); > in_definition = FALSE; > } > else if (strncmp(V->sbuffer,"ORIGIN", 6) != 0) > { > if (in_definition) > SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC); > } > else > break; > } > > readLoop(0, endGB, V); > > /* reading a real GenBank database file, we keep the source coords > */ > V->sqinfo->start = 1; > V->sqinfo->stop = V->seqlen; > V->sqinfo->olen = V->seqlen; > V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; > > > while (!(feof(V->f) || ((*V->sbuffer!=0) && (strstr(V->sbuffer,"LOCUS") == V->sbuffer)))) > getsqline(V); > /* SRE: V->s now holds "//", so sequential > reads are wedged: fixed Tue Jul 13 1993 */ > while (!feof(V->f) && strstr(V->sbuffer, "LOCUS ") != V->sbuffer) > getsqline(V); >} > >static int >endGCGdata(char *s, int *addend) >{ > *addend = 0; > return (*s == '>'); >} > >static void >readGCGdata(struct ReadSeqVars *V) >{ > int binary = FALSE; /* whether data are binary or not */ > int blen; /* length of binary sequence */ > > /* first line contains ">>>>" followed by name */ > if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->sbuffer, 2) == 0) > { > binary = TRUE; > SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); > blen = atoi(sqd_parse[2]); > } > else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->sbuffer, 1) == 0) > SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); > else > Die("bogus GCGdata format? %s", V->sbuffer); > > /* second line contains free text description */ > getsqline(V); > SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC); > > if (binary) { > /* allocate for blen characters +3... (allow for 3 bytes of slop) */ > if (blen >= V->maxseq) { > V->maxseq = blen; > if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL) > Die("malloc failed"); > } > /* read (blen+3)/4 bytes from file */ > if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4)) > Die("fread failed"); > V->seqlen = blen; > /* convert binary code to seq */ > GCGBinaryToSequence(V->seq, blen); > } > else readLoop(0, endGCGdata, V); > > while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>')))) > getsqline(V); >} > >static int >endPearson(char *s, int *addend) >{ > *addend = 0; > return(*s == '>'); >} > >static void >readPearson(struct ReadSeqVars *V) >{ > char *sptr; > > if ((sptr = strtok(V->sbuffer+1, "\n\t ")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); > if ((sptr = strtok(NULL, "\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); > /* workaround for long NCBI NR lines */ > while (V->longline && ! feof(V->f)) getsqline(V); > > readLoop(0, endPearson, V); > > while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>')))) > getsqline(V); >} > > >static int >endEMBL(char *s, int *addend) >{ > *addend = 0; > /* Some people (Berlin 5S rRNA database, f'r instance) use > * an extended EMBL format that attaches extra data after > * the sequence -- watch out for that. We use the fact that > * real EMBL sequence lines begin with five spaces. > * > * We can use this as the sole end test because readEMBL() will > * advance to the next ID line before starting to read again. > */ > return (strncmp(s," ",5) != 0); >/* return ((strstr(s,"//") != NULL) || (strstr(s,"ID ") == s)); */ >} > >static void >readEMBL(struct ReadSeqVars *V) >{ > char *sptr; > > /* make sure we have first line */ > while (!feof(V->f) && strncmp(V->sbuffer, "ID ", 4) != 0) > getsqline(V); > > if ((sptr = strtok(V->sbuffer+5, "\n\t ")) != NULL) > { > SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); > SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); > } > > do { > getsqline(V); > if (!feof(V->f) && strstr(V->sbuffer, "AC ") == V->sbuffer) > { > if ((sptr = strtok(V->sbuffer+5, "; \t\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); > } > else if (!feof(V->f) && strstr(V->sbuffer, "DE ") == V->sbuffer) > { > if ((sptr = strtok(V->sbuffer+5, "\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); > } > } while (! feof(V->f) && strncmp(V->sbuffer,"SQ",2) != 0); > > readLoop(0, endEMBL, V); > > /* reading a real EMBL database file, we keep the source coords > */ > V->sqinfo->start = 1; > V->sqinfo->stop = V->seqlen; > V->sqinfo->olen = V->seqlen; > V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; > > /* load next record's ID line */ > while (!feof(V->f) && strncmp(V->sbuffer, "ID ", 4) != 0) > getsqline(V); >} > > >static int >endZuker(char *s, int *addend) >{ > *addend = 0; > return( *s == '(' ); >} > >static void >readZuker(struct ReadSeqVars *V) >{ > char *sptr; > > getsqline(V); /*s == "seqLen seqid string..."*/ > > if ((sptr = strtok(V->sbuffer+6, " \t\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); > > if ((sptr = strtok(NULL, "\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); > > readLoop(0, endZuker, V); > > while (!(feof(V->f) | ((*V->sbuffer != '\0') & (*V->sbuffer == '(')))) > getsqline(V); >} > >static void >readUWGCG(struct ReadSeqVars *V) >{ > char *si; > char *sptr; > int done; > > V->seqlen = 0; > > /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ > /*drop above or ".." from id*/ > if ((si = strstr(V->sbuffer," Length: ")) != NULL) *si = 0; > else if ((si = strstr(V->sbuffer,"..")) != NULL) *si = 0; > > if ((sptr = strtok(V->sbuffer, "\n\t ")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); > > do { > done = feof(V->f); > getsqline(V); > if (! done) addseq(V->sbuffer, V); > } while (!done); >} > > >static void >readSquid(struct ReadSeqVars *V) >{ > char *sptr; > int dostruc = FALSE; > > while (strncmp(V->sbuffer, "NAM ", 4) != 0) getsqline(V); > > if ((sptr = strtok(V->sbuffer+4, "\n\t ")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); > > /*CONSTCOND*/ > while (1) > { > getsqline(V); > if (feof(V->f)) {squid_errno = SQERR_FORMAT; return; } > > if (strncmp(V->sbuffer, "SRC ", 4) == 0) > { > if ((sptr = strtok(V->sbuffer+4, " \t\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); > if ((sptr = strtok(NULL, " \t\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); > if ((sptr = strtok(NULL, ".")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_START); > if ((sptr = strtok(NULL, ".:")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_STOP); > if ((sptr = strtok(NULL, ": \t\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_OLEN); > } > else if (strncmp(V->sbuffer, "DES ", 4) == 0) > { > if ((sptr = strtok(V->sbuffer+4, "\n")) != NULL) > SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); > } > else if (strncmp(V->sbuffer,"SEQ", 3) == 0) > break; > } > > if (strstr(V->sbuffer, "+SS") != NULL) dostruc = TRUE; > > V->seqlen = 0; > /*CONSTCOND*/ > while (1) > { > /* sequence line */ > getsqline(V); > if (feof(V->f) || strncmp(V->sbuffer, "++", 2) == 0) > break; > addseq(V->sbuffer, V); > /* structure line */ > if (dostruc) > { > getsqline(V); > if (feof(V->f)) { squid_errno = SQERR_FORMAT; return; } > addstruc(V->sbuffer, V); > } > } > > > while (!feof(V->f) && strncmp(V->sbuffer, "NAM ", 4) != 0) > getsqline(V); >} > > >/* Function: SeqfileOpen() > * > * Purpose : Open a sequence database file and prepare for reading > * sequentially. > * > * Args: filename - name of file to open > * format - format of file > * env - environment variable for path (e.g. BLASTDB) > * > * Returns opened SQFILE ptr, or NULL on failure. > */ >SQFILE * >SeqfileOpen(char *filename, int format, char *env) >{ > SQFILE *dbfp; > > dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE)); > dbfp->format = format; > dbfp->longline = FALSE; > > /* Open our file handle. > * Three possibilities: > * 1. normal file open > * 2. filename = "-"; read from stdin > * 3. filename = "*.gz"; read thru pipe from gzip > * If we're reading from stdin or a pipe, we can't reliably > * back up, so we can't do two-pass parsers like the interleaved alignment > * formats. > */ > if (strcmp(filename, "-") == 0) > { > if (IsInterleavedFormat(format)) > Die("Can't read interleaved alignment formats thru stdin, sorry"); > > dbfp->f = stdin; > dbfp->do_stdin = TRUE; > dbfp->do_gzip = FALSE; > } > else if (Strparse("^.*\\.gz$", filename, 0) == 0) > { > char cmd[256]; > > if (IsInterleavedFormat(format)) > Die("Can't read interleaved alignment formats thru gunzip, sorry"); > > if (strlen(filename) + strlen("gzip -dc ") >= 256) > { squid_errno = SQERR_PARAMETER; return NULL; } > sprintf(cmd, "gzip -dc %s", filename); > if ((dbfp->f = popen(cmd, "r")) == NULL) > { squid_errno = SQERR_NOFILE; return NULL; } /* file (or gzip!) doesn't exist */ > dbfp->do_stdin = FALSE; > dbfp->do_gzip = TRUE; > } > else > { > if ((dbfp->f = fopen(filename, "r")) == NULL && > (dbfp->f = EnvFileOpen(filename, env)) == NULL) > { squid_errno = SQERR_NOFILE; return NULL; } > dbfp->do_stdin = FALSE; > dbfp->do_gzip = FALSE; > } > > /* The hack for sequential access of an interleaved alignment file: > * read the alignment in, we'll copy sequences out one at a time. > */ > dbfp->ali_aseqs = NULL; > if (IsInterleavedFormat(format)) > { > if (! ReadAlignment(filename, format, &(dbfp->ali_aseqs), &(dbfp->ali_ainfo))) > return NULL; > dbfp->ali_curridx = 0; > return dbfp; > } > > /* Load the first line. > */ > getsqline(dbfp); > > return dbfp; >} > >/* Function: SeqfilePosition() > * > * Purpose: Move to a particular offset in a seqfile. > * Will not work on interleaved files (SELEX, MSF). > */ >void >SeqfilePosition(SQFILE *sqfp, long offset) >{ > if (sqfp->do_stdin || sqfp->do_gzip || IsInterleavedFormat(sqfp->format)) > Die("SeqfilePosition() failed: in a nonrewindable data file or stream"); > > fseek(sqfp->f, offset, SEEK_SET); > getsqline(sqfp); >} > > >/* Function: SeqfileRewind() > * > * Purpose: Set a sequence file back to the first sequence. > * How to do this varies with whether the file is an > * unaligned file, or an alignment file for which > * we're hacking sequential access. > */ >void >SeqfileRewind(SQFILE *sqfp) >{ > if (sqfp->do_stdin || sqfp->do_gzip) > Die("SeqfileRewind() failed: in a nonrewindable data file or stream"); > > if (sqfp->ali_aseqs != NULL) sqfp->ali_curridx = 0; > else { > rewind(sqfp->f); > getsqline(sqfp); > } >} > >void >SeqfileClose(SQFILE *sqfp) >{ > /* free static ptrs if we used them */ > if (sqfp->ali_aseqs != NULL) > FreeAlignment(sqfp->ali_aseqs, &(sqfp->ali_ainfo)); > > if (sqfp->do_gzip) pclose(sqfp->f); > else if (! sqfp->do_stdin) fclose(sqfp->f); > free(sqfp); >} > > >/* Function: ReadSeq() > * > * Purpose: Read next sequence from an open database file. > * Return the sequence and associated info. > * > * Args: fp - open sequence database file pointer > * format - format of the file (previously determined > * by call to SeqfileFormat()) > * ret_seq - RETURN: sequence > * sqinfo - RETURN: filled in w/ other information > * > * Return: 1 on success, 0 on failure. > * ret_seq and some field of sqinfo are allocated here, > * The preferred call mechanism to properly free the memory is: > * > * SQINFO sqinfo; > * char *seq; > * > * ReadSeq(fp, format, &seq, &sqinfo); > * ... do something... > * FreeSequence(seq, &sqinfo); > */ >int >ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo) >{ > int gotuw; > int apos, rpos; > > squid_errno = SQERR_OK; > if (format < kMinFormat || format > kMaxFormat) > { > squid_errno = SQERR_FORMAT; > *ret_seq = NULL; > return 0; > } > > /* Here's the hack for sequential access of sequences from > * the multiple sequence alignment formats > */ > if (format == kMSF || format == kSelex || format == kClustal) { > if (V->ali_curridx >= V->ali_ainfo.nseq) > return 0; /* out of aseqs */ > > SeqinfoCopy(sqinfo, &(V->ali_ainfo.sqinfo[V->ali_curridx])); > > /* copy and dealign the appropriate aligned seq */ > V->seq = MallocOrDie (sizeof(char) * (V->ali_ainfo.alen+1)); > for (rpos = apos = 0; apos < V->ali_ainfo.alen; apos++) > if (!isgap(V->ali_aseqs[V->ali_curridx][apos])) > V->seq[rpos++] = V->ali_aseqs[V->ali_curridx][apos]; > V->seq[rpos] = '\0'; > V->seqlen = rpos; > V->ali_curridx++; > } > else { > if (feof(V->f)) return 0; > > V->seq = (char*) calloc (kStartLength+1, sizeof(char)); > V->maxseq = kStartLength; > V->seqlen = 0; > V->sqinfo = sqinfo; > V->sqinfo->flags = 0; > V->dash_equals_n = (format == kEMBL) ? TRUE : FALSE; > > switch (format) { > case kIG : readIG(V); break; > case kStrider : readStrider(V); break; > case kGenBank : readGenBank(V); break; > case kPearson : readPearson(V); break; > case kEMBL : readEMBL(V); break; > case kZuker : readZuker(V); break; > case kPIR : readPIR(V); break; > case kSquid : readSquid(V); break; > case kGCGdata : readGCGdata(V); break; > > case kGCG: > do { /* skip leading comments on GCG file */ > gotuw = (strstr(V->sbuffer,"..") != NULL); > if (gotuw) readUWGCG(V); > getsqline(V); > } while (! feof(V->f)); > break; > > case kIdraw: /* SRE: no attempt to read idraw postscript */ > default: > squid_errno = SQERR_FORMAT; > free(V->seq); > return 0; > } > V->seq[V->seqlen] = 0; /* stick a string terminator on it */ > } > > /* Cleanup > */ > sqinfo->len = V->seqlen; > sqinfo->flags |= SQINFO_LEN; > *ret_seq = V->seq; > if (squid_errno == SQERR_OK) return 1; else return 0; >} > >/* Function: SeqfileFormat() > * > * Purpose: Determine format of seqfile, and return it > * through ret_format. From Gilbert's seqFileFormat(). > * > * If filename is "-", we will read from stdin and > * assume that the stream is coming in FASTA format -- > * either unaligned or aligned. > * > * Args: filename - name of sequence file > * ret_format - RETURN: format code for file, see squid.h > * for codes. > * env - name of environment variable containing > * a directory path that filename might also be > * found in. "BLASTDB", for example. Can be NULL. > * > * Return: 1 on success, 0 on failure. > */ >int >SeqfileFormat(char *filename, int *ret_format, char *env) >{ > int foundIG = 0; > int foundStrider = 0; > int foundGB = 0; > int foundEMBL = 0; > int foundPearson = 0; > int foundZuker = 0; > int gotGCGdata = 0; > int gotPIR = 0; > int gotSquid = 0; > int gotuw = 0; > int gotMSF = 0; > int gotClustal = 0; > int done = 0; > int format = kUnknown; > int nlines= 0, dnalines= 0; > int splen = 0; > char sp[LINEBUFLEN]; > FILE *fseq; > > /* First check if filename is "-": special case indicating > * a FASTA pipe. > */ > if (strcmp(filename, "-") == 0) > { *ret_format = kPearson; return 1; } > >#define ReadOneLine(sp) \ > { done |= (feof(fseq)); \ > readline( fseq, sp); \ > if (!done) { splen = (int) strlen(sp); ++nlines; } } > > if ((fseq = fopen(filename, "r")) == NULL && > (fseq = EnvFileOpen(filename, env)) == NULL) > { squid_errno = SQERR_NOFILE; return 0; } > > /* Look at a line at a time > */ > while ( !done ) { > ReadOneLine(sp); > > if (sp==NULL || *sp=='\0') > /*EMPTY*/ ; > > /* high probability identities: */ > > else if (strstr(sp, " MSF:") != NULL && > strstr(sp, " Type:") != NULL && > strstr(sp, " Check:") != NULL) > gotMSF = 1; > > else if (strncmp(sp, "CLUSTAL ", 8) == 0 && > strstr( sp, "multiple sequence alignment")) > gotClustal = 1; > > else if (strstr(sp," Check: ") != NULL) > gotuw= 1; > > else if (strncmp(sp, "///", 3) == 0 || strncmp(sp, "ENTRY ", 6) == 0) > gotPIR = 1; > > else if (strncmp(sp, "++", 2) == 0 || strncmp(sp, "NAM ", 4) == 0) > gotSquid = 1; > > else if (strncmp(sp, ">>>>", 4) == 0 && strstr(sp, "Len: ")) > gotGCGdata = 1; > > /* uncertain identities: */ > > else if (*sp ==';') { > if (strstr(sp,"Strider") !=NULL) foundStrider= 1; > else foundIG= 1; > } > else if (strncmp(sp,"LOCUS",5) == 0 || strncmp(sp,"ORIGIN",5) == 0) > foundGB= 1; > > else if (*sp == '>') { > foundPearson = 1; > } > > else if (strstr(sp,"ID ") == sp || strstr(sp,"SQ ") == sp) > foundEMBL= 1; > > else if (*sp == '(') > foundZuker= 1; > > else { > switch (Seqtype( sp )) { > case kDNA: > case kRNA: if (splen>20) dnalines++; break; > default: break; > } > } > > if (gotMSF) {format = kMSF; done = 1; } > else if (gotClustal) {format = kClustal; done = 1; } > else if (gotSquid) {format = kSquid; done = 1; } > else if (gotPIR) {format = kPIR; done = 1; } > else if (gotGCGdata) {format = kGCGdata; done = 1; } > else if (gotuw) > { > if (foundIG) format= kIG; /* a TOIG file from GCG for certain */ > else format= kGCG; > done= 1; > } > else if ((dnalines > 1) || done || (nlines > 500)) { > /* decide on most likely format */ > /* multichar idents: */ > if (foundStrider) format= kStrider; > else if (foundGB) format= kGenBank; > else if (foundEMBL) format= kEMBL; > /* single char idents: */ > else if (foundIG) format= kIG; > else if (foundPearson) format= kPearson; > else if (foundZuker) format= kZuker; > /* spacing ident: */ > else if (IsSELEXFormat(filename)) format= kSelex; > /* no format chars: */ > else > { > squid_errno = SQERR_FORMAT; > return 0; > } > > done= 1; > } > } > > if (fseq!=NULL) fclose(fseq); > > *ret_format = format; > return 1; >#undef ReadOneLine >} > >/* Function: GCGBinaryToSequence() > * > * Purpose: Convert a GCG 2BIT binary string to DNA sequence. > * 0 = C 1 = T 2 = A 3 = G > * 4 nts/byte > * > * Args: seq - binary sequence. Converted in place to DNA. > * len - length of DNA. binary is (len+3)/4 bytes > */ >int >GCGBinaryToSequence(char *seq, int len) >{ > int bpos; /* position in binary */ > int spos; /* position in sequence */ > char twobit; > int i; > > for (bpos = (len-1)/4; bpos >= 0; bpos--) > { > twobit = seq[bpos]; > spos = bpos*4; > > for (i = 3; i >= 0; i--) > { > switch (twobit & 0x3) { > case 0: seq[spos+i] = 'C'; break; > case 1: seq[spos+i] = 'T'; break; > case 2: seq[spos+i] = 'A'; break; > case 3: seq[spos+i] = 'G'; break; > } > twobit = twobit >> 2; > } > } > seq[len] = '\0'; > return 1; >} > > >int >GCGchecksum(char *seq, int seqlen) >{ > int check = 0, count = 0, i; > > for (i = 0; i < seqlen; i++) { > count++; > check += count * sre_toupper((int) seq[i]); > if (count == 57) count = 0; > } > return (check % 10000); >} > > >/* Function: GCGMultchecksum() > * > * Purpose: Simple modification of GCGchecksum(), > * to create a checksum for multiple sequences. > * Gaps count. > * > * Args: seqs - sequences to be checksummed > * nseq - number of sequences > * > * Return: the checksum, a number between 0 and 9999 > */ >int >GCGMultchecksum(char **seqs, int nseq) >{ > int check = 0; > int count = 0; > int idx; > char *sptr; > > for (idx = 0; idx < nseq; idx++) > for (sptr = seqs[idx]; *sptr; sptr++) > { > count++; > check += count * sre_toupper((int) *sptr); > if (count == 57) count = 0; > } > return (check % 10000); >} > > > > >/* Function: Seqtype() > * > * Purpose: Returns a (very good) guess about type of sequence: > * kDNA, kRNA, kAmino, or kOtherSeq. > * > * Modified from, and replaces, Gilbert getseqtype(). > */ >int >Seqtype(char *seq) >{ > int saw; /* how many non-gap characters I saw */ > char c; > int po = 0; /* count of protein-only */ > int nt = 0; /* count of t's */ > int nu = 0; /* count of u's */ > int na = 0; /* count of nucleotides */ > int aa = 0; /* count of amino acids */ > int no = 0; /* count of others */ > > /* Look at the first 300 non-gap characters > */ > for (saw = 0; *seq != '\0' && saw < 300; seq++) > { > c = sre_toupper((int) *seq); > if (! isgap(c)) > { > if (strchr(protonly, c)) po++; > else if (strchr(primenuc,c)) { > na++; > if (c == 'T') nt++; > else if (c == 'U') nu++; > } > else if (strchr(aminos,c)) aa++; > else if (isalpha(c)) no++; > saw++; > } > } > > if (no > 0) return kOtherSeq; > else if (po > 0) return kAmino; > else if (na > aa) { > if (nu > nt) return kRNA; > else return kDNA; > } > else return kAmino; >} > >int >WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo) >{ > int numline = 0; > int lines = 0, spacer = 0, width = 50, tab = 0; > int i, j, l, l1, ibase; > char endstr[10]; > char s[100]; /* buffer for sequence */ > char ss[100]; /* buffer for structure */ > int checksum = 0; > int seqlen; > int which_case; /* 0 = do nothing. 1 = upper case. 2 = lower case */ > int dostruc; /* TRUE to print structure lines*/ > > which_case = 0; > dostruc = FALSE; > seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq); > > /* intercept Selex-format requests - SRE */ > if (outform == kSelex) { > fprintf(outf, "%10s %s\n", sqinfo->name, seq); > return 1; > } > > if (outform == kClustal || outform == kMSF) { > Warn("Tried to write Clustal or MSF with WriteSeq() -- bad, bad."); > return 1; > } > > strcpy( endstr,""); > l1 = 0; > > /* 10Nov91: write this out in all possible formats: */ > checksum = GCGchecksum(seq, seqlen); > > switch (outform) { > > case kUnknown: /* no header, just sequence */ > strcpy(endstr,"\n"); /* end w/ extra blank line */ > break; > > case kGenBank: > fprintf(outf,"LOCUS %s %d bp\n", > (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name, > seqlen); > fprintf(outf,"DEFINITION %s\n", > (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); > fprintf(outf,"ACCESSION %s\n", > (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); > fprintf(outf,"ORIGIN \n"); > spacer = 11; > numline = 1; > strcpy(endstr, "\n//"); > break; > > case kGCGdata: > fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen); > fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); > break; > > case kPIR: > fprintf(outf, "ENTRY %s\n", > (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); > fprintf(outf, "TITLE %s\n", > (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); > fprintf(outf, "ACCESSION %s\n", > (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); > fprintf(outf, "SUMMARY #Length %d #Checksum %d\n", > sqinfo->len, checksum); > fprintf(outf, "SEQUENCE\n"); > fprintf(outf, " 5 10 15 20 25 30\n"); > spacer = 2; /* spaces after every residue */ > numline = 1; /* number lines w/ coords */ > width = 30; /* 30 aa per line */ > strcpy(endstr, "\n///"); > break; > > case kSquid: > fprintf(outf, "NAM %s\n", sqinfo->name); > if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN)) > fprintf(outf, "SRC %s %s %d..%d::%d\n", > (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-", > (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-", > (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0, > (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0, > (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0); > if (sqinfo->flags & SQINFO_DESC) > fprintf(outf, "DES %s\n", sqinfo->desc); > if (sqinfo->flags & SQINFO_SS) > { > fprintf(outf, "SEQ +SS\n"); > dostruc = TRUE; /* print structure lines too */ > } > else > fprintf(outf, "SEQ\n"); > numline = 1; /* number seq lines w/ coords */ > strcpy(endstr, "\n++"); > break; > > case kEMBL: > fprintf(outf,"ID %s\n", > (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); > fprintf(outf,"AC %s\n", > (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); > fprintf(outf,"DE %s\n", > (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); > fprintf(outf,"SQ %d BP\n", seqlen); > strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ > tab = 5; /** added 31jan91 */ > spacer = 11; /** added 31jan91 */ > break; > > case kGCG: > fprintf(outf,"%s\n", sqinfo->name); > if (sqinfo->flags & SQINFO_ACC) > fprintf(outf,"ACCESSION %s\n", sqinfo->acc); > if (sqinfo->flags & SQINFO_DESC) > fprintf(outf,"DEFINITION %s\n", sqinfo->desc); > fprintf(outf," %s Length: %d (today) Check: %d ..\n", > sqinfo->name, seqlen, checksum); > spacer = 11; > numline = 1; > strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ > break; > > case kStrider: /* ?? map ?*/ > fprintf(outf,"; ### from DNA Strider ;-)\n"); > fprintf(outf,"; DNA sequence %s, %d bases, %d checksum.\n;\n", > sqinfo->name, seqlen, checksum); > strcpy(endstr, "\n//"); > break; > > /* SRE: Don had Zuker default to Pearson, which is not > intuitive or helpful, since Zuker's MFOLD can't read > Pearson format. More useful to use kIG */ > case kZuker: > which_case = 1; /* MFOLD requires upper case. */ > /*FALLTHRU*/ > case kIG: > fprintf(outf,";%s %s\n", > sqinfo->name, > (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ""); > fprintf(outf,"%s\n", sqinfo->name); > strcpy(endstr,"1"); /* == linear dna */ > break; > > case kRaw: /* Raw: just print the whole sequence. */ > fprintf(outf, "%s\n", seq); > return 1; > > default : > case kPearson: > fprintf(outf,">%s %s\n", sqinfo->name, > (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ""); > break; > } > > if (which_case == 1) s2upper(seq); > if (which_case == 2) s2lower(seq); > > > width = MIN(width,100); > for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) { > if (l1 < 0) l1 = 0; > else if (l1 == 0) { > if (numline) fprintf(outf,"%8d ",ibase); > for (j=0; j<tab; j++) fputc(' ',outf); > } > if ((spacer != 0) && ((l+1) % spacer == 1)) > { s[l] = ' '; ss[l] = ' '; l++; } > s[l] = seq[i]; > ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.'; > l++; i++; > l1++; /* don't count spaces for width*/ > if (l1 == width || i == seqlen) { > s[l] = ss[l] = '\0'; > l = 0; l1 = 0; > if (dostruc) > { > fprintf(outf, "%s\n", s); > if (numline) fprintf(outf," "); > for (j=0; j<tab; j++) fputc(' ',outf); > if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr); > else fprintf(outf,"%s\n",ss); > } > else > { > if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr); > else fprintf(outf,"%s\n",s); > } > lines++; > ibase = i+1; > } > } > return lines; >} > > >/* Function: ReadMultipleRseqs() > * > * Purpose: Open a data file and > * parse it into an array of rseqs (raw, unaligned > * sequences). > * > * Caller is responsible for free'ing memory allocated > * to ret_rseqs, ret_weights, and ret_names. > * > * Weights are currently only supported for MSF format. > * Sequences read from all other formats will be assigned > * weights of 1.0. If the caller isn't interested in > * weights, it passes NULL as ret_weights. > * > * Returns 1 on success. Returns 0 on failure and sets > * squid_errno to indicate the cause. > */ >int >ReadMultipleRseqs(char *seqfile, > int fformat, > char ***ret_rseqs, > SQINFO **ret_sqinfo, > int *ret_num) >{ > SQINFO *sqinfo; /* array of sequence optional info */ > SQFILE *dbfp; /* open ptr for sequential access of file */ > char **rseqs; /* sequence array */ > char **aseqs; /* aligned sequences, if file is aligned */ > AINFO ainfo; /* alignment-associated information */ > int numalloced; /* num of seqs currently alloced for */ > int idx; > int num; > > if (fformat == kSelex || fformat == kMSF || fformat == kClustal) > { > if (! ReadAlignment(seqfile, fformat, &aseqs, &ainfo)) return 0; > if (! DealignAseqs(aseqs, ainfo.nseq, &rseqs)) return 0; > > /* copy the sqinfo array > */ > num = ainfo.nseq; > sqinfo= (SQINFO *) MallocOrDie (sizeof(SQINFO)*ainfo.nseq); > for (idx = 0; idx < ainfo.nseq; idx++) > SeqinfoCopy(&(sqinfo[idx]), &(ainfo.sqinfo[idx])); > FreeAlignment(aseqs, &ainfo); > } > else > { > /* initial alloc */ > num = 0; > numalloced = 16; > rseqs = (char **) MallocOrDie (numalloced * sizeof(char *)); > sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO)); > if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0; > > while (ReadSeq(dbfp, fformat, &rseqs[num], &(sqinfo[num]))) > { > num++; > if (num == numalloced) /* more seqs coming, alloc more room */ > { > numalloced += 16; > rseqs = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *)); > sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO)); > } > } > SeqfileClose(dbfp); > } > > *ret_rseqs = rseqs; > *ret_sqinfo = sqinfo; > *ret_num = num; > return 1; >} > > > > >char * >SeqFormatString(int code) >{ > switch (code) { > case kUnknown: return "Unknown"; > case kIG: return "Intelligenetics"; > case kGenBank: return "GenBank"; > case kEMBL: return "EMBL"; > case kGCG: return "GCG"; > case kGCGdata: return "GCG datalibrary"; > case kStrider: return "Strider"; > case kPearson: return "FASTA"; > case kZuker: return "Zuker"; > case kIdraw: return "Idraw"; > case kSelex: return "SELEX"; > case kMSF: return "MSF"; > case kClustal: return "Clustal"; > case kPIR: return "PIR-CODATA"; > case kRaw: return "raw"; > case kSquid: return "Squid"; > default: return "(bad code)"; > } >} > > > >/* Function: GSIOpen() > * > * Purpose: Open a GSI file. Returns the number of records in > * the file and a file pointer. Returns NULL on failure. > * The file pointer should be fclose()'d normally. > */ >GSIFILE * >GSIOpen(char *gsifile) >{ > GSIFILE *gsi; > char magic[GSI_KEYSIZE]; > > gsi = (GSIFILE *) MallocOrDie (sizeof(GSIFILE)); > if ((gsi->gsifp = fopen(gsifile, "r")) == NULL) > { squid_errno = SQERR_NOFILE; return NULL; } > > if (! fread(magic, sizeof(char), GSI_KEYSIZE, gsi->gsifp)) > { squid_errno = SQERR_NODATA; return NULL; } > if (strcmp(magic, "GSI") != 0) > { squid_errno = SQERR_FORMAT; return NULL; } > > if (! fread(&(gsi->nfiles), sizeof(sqd_uint16), 1, gsi->gsifp)) > { squid_errno = SQERR_NODATA; return NULL; } > if (! fread(&(gsi->recnum), sizeof(sqd_uint32), 1, gsi->gsifp)) > { squid_errno = SQERR_NODATA; return NULL; } > > gsi->nfiles = sre_ntohs(gsi->nfiles); /* convert from network short */ > gsi->recnum = sre_ntohl(gsi->recnum); /* convert from network long */ > > return gsi; >} > >/* Function: GSIGetRecord() > * > * Purpose: Each non-header record of a GSI index files consists > * of 38 bytes: 32 bytes of character string, a 2 byte > * short, and a 4 byte long. This function returns the > * three values. > * > * Args: gsi - open GSI index file, correctly positioned at a record > * f1 - char[32], allocated by caller (or NULL if unwanted) > * f2 - pointer to short (or NULL if unwanted) > * f3 - pointer to long (or NULL if unwanted) > * > * Return: 0 on failure and sets squid_errno. > */ >static int >GSIGetRecord(GSIFILE *gsi, char *f1, sqd_uint16 *f2, sqd_uint32 *f3) >{ > if (f1 == NULL) fseek(gsi->gsifp, GSI_KEYSIZE, SEEK_CUR); > else if (! fread(f1, GSI_KEYSIZE, 1, gsi->gsifp)) > { squid_errno = SQERR_NODATA; return 0; } > > if (f2 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint16), SEEK_CUR); > else if (! fread(f2, sizeof(sqd_uint16), 1, gsi->gsifp)) > { squid_errno = SQERR_NODATA; return 0; } > > if (f3 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint32), SEEK_CUR); > else if (! fread(f3, sizeof(sqd_uint32), 1, gsi->gsifp)) > { squid_errno = SQERR_NODATA; return 0; } > > if (f2 != NULL) *f2 = sre_ntohs(*f2); > if (f3 != NULL) *f3 = sre_ntohl(*f3); > > return 1; >} > > >/* Function: GSIGetOffset() > * > * Purpose: From a key (sequence name), find a disk offset > * in an open general sequence index file by binary > * search. Presumably GSI indexing could be even faster > * if we used hashing. > * > * Args: gsi - GSI index file, opened by GSIOpen() > * key - name of key to retrieve indices for > * ret_seqfile - pre-alloced char[32] array for seqfile name > * ret_fmt - format of seqfile > * ret_offset - return: disk offset in seqfile. > */ >int >GSIGetOffset(GSIFILE *gsi, char *key, char *ret_seqfile, > int *ret_format, long *ret_offset) >{ > sqd_uint32 left, right, mid; > int cmp; > char name[GSI_KEYSIZE + 1]; > sqd_uint32 offset; > sqd_uint16 filenum; > sqd_uint32 fmt; > > name[GSI_KEYSIZE] = '\0'; > > left = gsi->nfiles + 1; > right = gsi->nfiles + gsi->recnum; > mid = (left + right) / 2; > fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET); > > while (GSIGetRecord(gsi, name, &filenum, &offset)) > { > cmp = strcmp(name, key); > if (cmp == 0) break; /* found it! */ > else if (left >= right) return 0; /* oops, missed it; fail. */ > else if (cmp < 0) left = mid + 1; /* it's right of mid */ > else if (cmp > 0) right = mid - 1; /* it's left of mid */ > mid = (left + right) / 2; > fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET); > } > > /* Using file number, look up the sequence file and format. > */ > fseek(gsi->gsifp, filenum * GSI_RECSIZE, SEEK_SET); > GSIGetRecord(gsi, ret_seqfile, NULL, &fmt); > *ret_format = (int) fmt; > *ret_offset = (long) offset; > > return 1; >} > >/* Function: GSIClose() > * > * Purpose: Close an open GSI sequence index file. > */ >void >GSIClose(GSIFILE *gsi) >{ > fclose(gsi->gsifp); > free(gsi); >} > >
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Raw
Actions:
View
Attachments on
bug 278307
: 198458