static char rcsid[] = "$Id: regiondb-write.c 224335 2021-06-21 18:52:43Z twu $";
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifndef HAVE_MEMCPY
# define memcpy(d,s,n) bcopy((s),(d),(n))
#endif
#ifndef HAVE_MEMMOVE
# define memmove(d,s,n) bcopy((s),(d),(n))
#endif

#include "regiondb-write.h"
#include "regiondbdef.h"	/* For REGION_LENGTH */

#ifdef WORDS_BIGENDIAN
#include "bigendian.h"
#else
#include "littleendian.h"
#endif

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>		/* For memset */
#include <ctype.h>		/* For toupper */
#include <sys/mman.h>		/* For munmap */

#ifdef HAVE_UNISTD_H
#include <unistd.h>		/* For lseek and close */
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>		/* For off_t */
#endif
#if HAVE_DIRENT_H
# include <dirent.h>
# define NAMLEN(dirent) strlen((dirent)->d_name)
#else
# define dirent direct
# define NAMLEN(dirent) (dirent)->d_namlen
# if HAVE_SYS_NDIR_H
#  include <sys/ndir.h>
# endif
# if HAVE_SYS_DIR_H
#  include <sys/dir.h>
# endif
# if HAVE_NDIR_H
#  include <ndir.h>
# endif
#endif

#include "assert.h"
#include "mem.h"
#include "fopen.h"
#include "types.h"		/* For Oligospace_T */
#include "filesuffix.h"

#include "compress-write.h"	/* For Compress_get_char */
#include "complement.h"

#include "uintlist.h"


/* Debugging output for Regiondb_cat.  debug1(x) expands to x when
   DEBUG1 is defined at compile time, and to nothing otherwise. */
#ifdef DEBUG1
#define debug1(x) x
#else
#define debug1(x)
#endif


/* Regiondb_write prints a progress line to stderr whenever the genomic
   position is a multiple of this value.  Another MONITOR_INTERVAL is
   in compress.c */
#define MONITOR_INTERVAL 100000000 /* 100 million nt */


/* Returns base raised to the given exponent, computed by repeated
   multiplication.  An exponent of zero or less yields 1.  No overflow
   check is made. */
static Oligospace_T
power (int base, int exponent) {
  Oligospace_T product = 1;
  int remaining;

  for (remaining = exponent; remaining > 0; remaining--) {
    product *= base;
  }
  return product;
}


/* Writes a region database for the genome read from sequence_fp.

   Output file: <destdir>/<fileroot>.<filesuffix><region1part><interval_char>regiondb
   (the PMAP build also inserts the alphabet name and uses the fwd/rev
   suffix).  If the file cannot be opened for writing, a message is
   printed to stderr and the program exits with status 9.

   The genome is scanned one character at a time through
   Compress_get_char.  A, C, G and T are packed two bits apiece (0, 1,
   2, 3) into a rolling oligomer.  X and N restart the oligomer, as does
   crossing a chromosome bound taken from chromosome_iit.  Any other
   character restarts the oligomer when genome_lc_p is true, and
   otherwise prints a message and calls abort().

   In the non-PMAP build, once region1part consecutive valid characters
   have been seen and the sampling test passes (by default, the start
   of the oligomer within its chromosome is a multiple of
   region1interval), the oligomer is passed through reduce_fcn, masked
   to 2*region1part bits, and its genomic start position modulo
   REGION_LENGTH is recorded under that index.

   Output layout: one region per REGION_LENGTH start positions.  Each
   region is oligospace cumulative UINT2 offsets followed by
   REGION_LENGTH UINT2 positions, grouped by oligomer index in ascending
   order.  Unused position slots are zero.  A trailing partial region is
   written after the scan.

   Parameters:
     destdir, fileroot   directory and root name of the output file
     interval_char       character placed in the file name after the k-mer size
     sequence_fp         genome source handed to Compress_get_char
     chromosome_iit      supplies genome length and chromosome bounds
     region1part         oligomer length in nt (region1part_aa, in aa, for PMAP)
     region1interval     sampling interval for recorded oligomers
     genome_lc_p         passed to Compress_get_char; also selects how
                         unexpected characters are handled (see above)
     fileroot            also printed in progress messages
     mask_lowercase_p    false selects the UPPERCASE_U2T table, true NO_UPPERCASE
     reduce_fcn          applied to each oligomer before masking (non-PMAP)

   NOTE(review): as written, only the non-PMAP build with HAVE_64_BIT
   declares every variable it uses: masked is declared only without
   PMAP but is used by the region-writing loops in all builds, and
   oligo is declared only with HAVE_64_BIT but is used by the non-PMAP
   code in all builds.  The PMAP branch also counts offsets without
   ever pushing a position.  Confirm which configurations are meant to
   be supported.

   Want to call with part 6, interval 1 */
void
Regiondb_write (char *destdir, char interval_char, FILE *sequence_fp, Univ_IIT_T chromosome_iit,
#ifdef PMAP
		Alphabet_T alphabet, Width_T region1part_aa, bool watsonp,
#else
		Width_T region1part,
#endif
		Width_T region1interval, bool genome_lc_p, char *fileroot, bool mask_lowercase_p,
		Reduce_fcn_T reduce_fcn) {
  char *uppercaseCode;
  FILE *region_fp;
  char *regionfile;
  char *filesuffix;

  char *comma;
  int c, nchrs, chrnum;
  Oligospace_T oligospace;
  Univcoord_T position = 0, adjposition, next_chrbound, total_genomelength;
  Chrpos_T chrpos = 0U;
#ifdef HAVE_64_BIT
  Oligospace_T oligo = 0ULL;
#else
  Shortoligomer_T high = 0U, low = 0U, carry;
#endif

  UINT2 *offsets;
  Uintlist_T *positions, p;
  UINT2 position_block[REGION_LENGTH];
  /* i indexes offsets[0..oligospace-1], so it must have the type of
     oligospace.  A UINT2 index could never reach an oligospace above
     65535: the prefix-sum loops below would wrap and never terminate. */
  Oligospace_T i;
  UINT2 k;


#ifdef PMAP
  int alphabet_size;
  int frame = -1, between_counter[3], in_counter[3];
  Shortoligomer_T aaindex;
  int region1part_nt = 3*region1part_aa;
#else
  Oligospace_T masked, mask;
  int between_counter, in_counter;
#endif
#ifdef DEBUG1
  char *aa;
#endif

  int circular_typeint;


#ifdef PMAP
  if (watsonp == true) {
    filesuffix = FWD_FILESUFFIX;
  } else {
    filesuffix = REV_FILESUFFIX;
  }
#else
  filesuffix = IDX_FILESUFFIX;
#endif

  /* NOTE(review): this size matches the non-PMAP format below (two
     digits for the k-mer).  The PMAP format additionally prints the
     alphabet name and a '.', which are not counted here. */
  regionfile = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
			       strlen(".")+strlen(filesuffix)+
			       /*for kmer*/2+/*for interval char*/1+
			       strlen("regiondb")+1,sizeof(char));
#ifdef PMAP
  sprintf(regionfile,"%s/%s.%s.%s%d%c%s",
	  destdir,fileroot,Alphabet_string(alphabet),filesuffix,region1part_aa,interval_char,"regiondb");
#else
  sprintf(regionfile,"%s/%s.%s%02d%c%s",
	  destdir,fileroot,filesuffix,region1part,interval_char,"regiondb");
#endif

  if ((region_fp = FOPEN_WRITE_BINARY(regionfile)) == NULL) {
    fprintf(stderr,"Can't write to file %s\n",regionfile);
    exit(9);
  } else {
    FREE(regionfile);
  }


  if (mask_lowercase_p == false) {
    uppercaseCode = UPPERCASE_U2T; /* We are reading DNA sequence */
  } else {
    uppercaseCode = NO_UPPERCASE;
  }

#ifdef PMAP
  alphabet_size = Alphabet_get_size(alphabet);
  oligospace = power(alphabet_size,region1part_aa);
  between_counter[0] = between_counter[1] = between_counter[2] = 0;
  in_counter[0] = in_counter[1] = in_counter[2] = 0;

#else
  /* mask keeps the low 2*region1part bits, i.e., the last region1part nt */
#ifdef HAVE_64_BIT
  mask = ~(~0ULL << 2*region1part);
#else
  mask = ~(~0U << 2*region1part);
#endif
  oligospace = power(4,region1part); /* 4^6 = 4096 */
  between_counter = in_counter = 0;
#endif

  /* Per-region accumulators: a list of positions and a count for each oligomer */
  positions = (Uintlist_T *) MALLOC(oligospace * sizeof(Uintlist_T));
  memset(positions,0,oligospace*sizeof(Uintlist_T));

  offsets = (UINT2 *) MALLOC(oligospace * sizeof(UINT2));
  memset(offsets,0,oligospace*sizeof(UINT2));


  /* Handle reference strain */
  total_genomelength = Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias_p*/true);
  circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
  chrnum = 1;
  nchrs = Univ_IIT_total_nintervals(chromosome_iit);
  next_chrbound = Univ_IIT_next_chrbound(chromosome_iit,chrnum,circular_typeint);

  while (position < total_genomelength) {
    c = Compress_get_char(sequence_fp,position,genome_lc_p);

#ifdef PMAP
    if (++frame == 3) {
      frame = 0;
    }
    between_counter[frame] += 1;
    in_counter[frame] += 1;
#else
    between_counter++;
    in_counter++;
#endif

    /* Progress report */
    if (position % MONITOR_INTERVAL == 0) {
      comma = Genomicpos_commafmt(position);
#ifdef PMAP
      fprintf(stderr,"Indexing offsets/positions of oligomers in genome %s (%d aa every %d aa), position %s",
	      fileroot,region1part_aa,region1interval,comma);
#else
      fprintf(stderr,"Indexing offsets/positions of oligomers in genome %s (%d bp every %d bp), position %s",
	      fileroot,region1part,region1interval,comma);
#endif
      FREE(comma);
#ifdef PMAP
      if (watsonp == true) {
	fprintf(stderr," (fwd)");
      } else {
	fprintf(stderr," (rev)");
      }
#endif
      fprintf(stderr,"\n");
    }

    /* Shift the new nucleotide into the rolling oligomer, or restart it */
#ifdef HAVE_64_BIT
    switch (uppercaseCode[c]) {
    case 'A': oligo = (oligo << 2); break;
    case 'C': oligo = (oligo << 2) | 1U; break;
    case 'G': oligo = (oligo << 2) | 2U; break;
    case 'T': oligo = (oligo << 2) | 3U; break;
    case 'X': case 'N':
      oligo = 0U;
#ifdef PMAP
      in_counter[0] = in_counter[1] = in_counter[2] = 0;
#else
      in_counter = 0;
#endif
      break;
    default:
      if (genome_lc_p == true) {
	oligo = 0U;
#ifdef PMAP
	in_counter[0] = in_counter[1] = in_counter[2] = 0;
#else
	in_counter = 0;
#endif
      } else {
	fprintf(stderr,"Bad character %c at position %llu\n",c,(unsigned long long) position);
	abort();
      }
    }

#else
    /* Two 32-bit words: the top nucleotide of low carries into high */
    carry = (low >> 30);
    switch (uppercaseCode[c]) {
    case 'A': low = (low << 2); break;
    case 'C': low = (low << 2) | 1U; break;
    case 'G': low = (low << 2) | 2U; break;
    case 'T': low = (low << 2) | 3U; break;
    case 'X': case 'N':
      high = low = carry = 0U;
#ifdef PMAP
      in_counter[0] = in_counter[1] = in_counter[2] = 0;
#else
      in_counter = 0;
#endif
      break;
    default:
      if (genome_lc_p == true) {
	high = low = carry = 0U;
#ifdef PMAP
	in_counter[0] = in_counter[1] = in_counter[2] = 0;
#else
	in_counter = 0;
#endif
      } else {
	fprintf(stderr,"Bad character %c at position %u\n",c,position);
	abort();
      }
    }
    high = (high << 2) | carry;
#endif

#ifdef PMAP
    /* NOTE(review): this file defines debug1 but not debug; confirm
       that a header supplies debug() for the PMAP build. */
    debug(printf("frame=%d char=%c bc=%d ic=%d high=%08X low=%08X\n",
		 frame,c,between_counter[frame],in_counter[frame],high,low));

    /* A stop codon restarts the count for the current reading frame */
    if (in_counter[frame] > 0) {
      if (watsonp == true) {
#ifdef HAVE_64_BIT
	if (Alphabet_get_codon_fwd(oligo) == AA_STOP) {
	  debug(printf("Resetting in_counter for frame %d to 0\n",frame));
	  in_counter[frame] = 0;
	}
#else
	if (Alphabet_get_codon_fwd(low) == AA_STOP) {
	  debug(printf("Resetting in_counter for frame %d to 0\n",frame));
	  in_counter[frame] = 0;
	}
#endif
      } else {
#ifdef HAVE_64_BIT
	if (Alphabet_get_codon_rev(oligo) == AA_STOP) {
	  debug(printf("Resetting in_counter for frame %d to 0\n",frame));
	  in_counter[frame] = 0;
	}
#else
	if (Alphabet_get_codon_rev(low) == AA_STOP) {
	  debug(printf("Resetting in_counter for frame %d to 0\n",frame));
	  in_counter[frame] = 0;
	}
#endif
      }
    }
    if (in_counter[frame] == region1part_aa + 1) {
      if (between_counter[frame] >= region1interval) {
#ifdef HAVE_64_BIT
	aaindex = Alphabet_get_aa_index(oligo,watsonp,region1part_nt);
#else
	aaindex = Alphabet_get_aa_index(high,low,watsonp,region1part_nt);
#endif
	offsets[aaindex] += 1;

	between_counter[frame] = 0;
      }
      in_counter[frame] -= 1;
    }

#else
    /* A full oligomer is available: record it if it falls on the sampling grid */
    if (in_counter == region1part) {
      if (
#ifdef NONMODULAR
	  between_counter >= region1interval
#else
	  (chrpos-region1part+1U) % region1interval == 0
#endif
	  ) {
	masked = reduce_fcn(oligo) & mask;
	positions[masked] = Uintlist_push(positions[masked],(position - region1part + 1) % REGION_LENGTH);
	offsets[masked] += 1;

	between_counter = 0;
      }
      /* Keep the window full so that the next character completes the next oligomer */
      in_counter--;
    }
#endif

    chrpos++;			/* Needs to go here, before we reset chrpos to 0 */
    if (position >= next_chrbound) {
      /* Oligomers must not span a chromosome bound */
#ifndef PMAP
      oligo = 0;
      in_counter = 0;
#elif defined(HAVE_64_BIT)
      oligo = 0ULL;
      in_counter[0] = in_counter[1] = in_counter[2] = 0;
#else
      high = low = carry = 0U;
      in_counter[0] = in_counter[1] = in_counter[2] = 0;
#endif

      chrpos = 0U;
      chrnum++;
      while (chrnum <= nchrs && (next_chrbound = Univ_IIT_next_chrbound(chromosome_iit,chrnum,circular_typeint)) < position) {
	chrnum++;
      }
    }

    position += 1;
    /* A region is complete once REGION_LENGTH oligomer start positions have been passed */
    if (position > (Univcoord_T) (region1part - 1) && (adjposition = position - region1part + 1) % REGION_LENGTH == 0) {
      /* Write offsets */
      /* Turn the per-oligomer counts into cumulative counts */
      for (i = 1; i < oligospace; i++) {
	offsets[i] = offsets[i-1] + offsets[i];
      }
      FWRITE_USHORTS(offsets,oligospace,region_fp);
      memset(offsets,0,oligospace*sizeof(UINT2));

      /* Write positions */
      memset(position_block,0,REGION_LENGTH*sizeof(UINT2));
      k = 0;
      for (masked = 0; masked < oligospace; masked++) {
	positions[masked] = Uintlist_reverse(positions[masked]);
	for (p = positions[masked]; p != NULL; p = Uintlist_next(p)) {
	  position_block[k++] = (UINT2) Uintlist_head(p);
	}
	Uintlist_free(&positions[masked]);
      }
      FWRITE_USHORTS(position_block,REGION_LENGTH,region_fp);
      memset(positions,0,oligospace*sizeof(Uintlist_T));
    }
  }

  /* Flush the final, partially filled region */
  if ((adjposition = position - region1part + 1) % REGION_LENGTH != 0) {
    /* Write offsets */
    for (i = 1; i < oligospace; i++) {
      offsets[i] = offsets[i-1] + offsets[i];
    }
    FWRITE_USHORTS(offsets,oligospace,region_fp);
    /* memset(offsets,0,oligospace*sizeof(UINT2)); */

    /* Write positions */
    memset(position_block,0,REGION_LENGTH*sizeof(UINT2));
    k = 0;
    for (masked = 0; masked < oligospace; masked++) {
      positions[masked] = Uintlist_reverse(positions[masked]);
      for (p = positions[masked]; p != NULL; p = Uintlist_next(p)) {
	position_block[k++] = (UINT2) Uintlist_head(p);
      }
      Uintlist_free(&positions[masked]);
    }
    FWRITE_USHORTS(position_block,REGION_LENGTH,region_fp);
    /* memset(positions,0,oligospace*sizeof(Uintlist_T)); */
  }

  fclose(region_fp);

  FREE(offsets);
  FREE(positions);

  fprintf(stderr,"Done\n");

  return;
}


/* In-memory copy of one region of a region database, as used by
   Regiondb_cat and the Region_* helpers */
typedef struct Region_T *Region_T;
struct Region_T {
  /* One entry per oligo; Region_new allocates offsets_size of them
     (4^6 for 6-mers).  Region_append and Region_remainder store in
     entry i the number of positions written so far for oligos 0..i,
     truncated to UINT2. */
  UINT2 *offsets;
  /* REGION_LENGTH entries, grouped by oligo.  Positions range from 0..65535 */
  UINT2 *positions;
};


/* Hands the positions array, the offsets array, and then the region
   itself to FREE. */
static void
Region_free (Region_T *old) {
  Region_T doomed = *old;

  FREE(doomed->positions);
  FREE(doomed->offsets);
  FREE(*old);
}

static Region_T
Region_new (size_t offsets_size) {
  Region_T new = (Region_T) MALLOC(sizeof(*new));

  new->offsets = (UINT2 *) CALLOC(offsets_size,sizeof(UINT2));
  new->positions = (UINT2 *) CALLOC(REGION_LENGTH,sizeof(UINT2));

  return new;
}

/* Zeroes all offsets_size offsets and all REGION_LENGTH positions of
   the given region. */
static void
Region_clear (Region_T this, size_t offsets_size) {
  const size_t offsets_bytes = offsets_size*sizeof(UINT2);
  const size_t positions_bytes = REGION_LENGTH*sizeof(UINT2);

  memset(this->positions,0,positions_bytes);
  memset(this->offsets,0,offsets_bytes);
}

/* Copies the offsets (offsets_size entries) and the positions
   (REGION_LENGTH entries) of source into the arrays already owned by
   dest. */
static void
Region_copy (Region_T dest, Region_T source, size_t offsets_size) {
  const size_t offsets_bytes = offsets_size*sizeof(UINT2);
  const size_t positions_bytes = REGION_LENGTH*sizeof(UINT2);

  memcpy(dest->offsets,source->offsets,offsets_bytes);
  memcpy(dest->positions,source->positions,positions_bytes);
}


/* Reads one region from fp into buffer: offsets_size offsets followed
   by REGION_LENGTH positions.  The results of the reads are not
   examined. */
static void
Region_read (Region_T buffer, FILE *fp, size_t offsets_size) {
  UINT2 *offsets = buffer->offsets;
  UINT2 *positions = buffer->positions;

  FREAD_USHORTS(offsets,offsets_size,fp);
  FREAD_USHORTS(positions,REGION_LENGTH,fp);
}

/* Writes one region to out: offsets_size offsets followed by
   REGION_LENGTH positions.  The results of the writes are not
   examined. */
static void
Region_write (FILE *out, Region_T current, size_t offsets_size) {
  UINT2 *offsets = current->offsets;
  UINT2 *positions = current->positions;

  FWRITE_USHORTS(offsets,offsets_size,out);
  FWRITE_USHORTS(positions,REGION_LENGTH,out);
}

/* Merges the leading part of buffer into current, in place.

   current is first copied into temp.  Then, for each oligo in index
   order, current receives (1) the positions that it already held for
   that oligo, read from temp, followed by (2) those positions of
   buffer for that oligo that are less than bufferlen, each increased
   by current_pos.  After each oligo, its offset is set to the number
   of positions written so far.  The unused tail of current->positions
   is zeroed.

   An offset smaller than its predecessor is taken to be a count of
   REGION_LENGTH that did not fit in a UINT2.

   temp is scratch space of the same dimensions as current; its
   previous contents are overwritten. */
static void
Region_append (Region_T current, Region_T buffer, Region_T temp, int current_pos,
	       int bufferlen, size_t offsets_size) {
  /* oligo must be as wide as offsets_size: a UINT2 counter could never
     reach an offsets_size of 65536 or more, so the loop would wrap
     around and run past the end of the positions array. */
  size_t oligo;
  UINT2 *p, position;
  int ptr0, end0, ptr1, end1, ptr;	/* Have to use int and not UINT2, because values could be 65536 */

  Region_copy(temp,current,offsets_size);

  p = &(current->positions[0]);
  for (oligo = 0; oligo < offsets_size; oligo++) {
    /* Range of this oligo in the old contents of current */
    ptr0 = (oligo == 0) ? 0 : temp->offsets[oligo - 1];
    if ((end0 = temp->offsets[oligo]) < ptr0) {
      end0 = REGION_LENGTH;
    }

    /* Range of this oligo in buffer */
    ptr1 = (oligo == 0) ? 0 : buffer->offsets[oligo - 1];
    if ((end1 = buffer->offsets[oligo]) < ptr1) {
      end1 = REGION_LENGTH;
    }

    for (ptr = ptr0; ptr < end0; ptr++) {
      position = temp->positions[ptr];
      *p++ = position;
    }
    for (ptr = ptr1; ptr < end1; ptr++) {
      if ((position = buffer->positions[ptr]) < bufferlen) {
	*p++ = position + current_pos;
      }
    }

    current->offsets[oligo] = p - &(current->positions[0]);
  }

  while (p < &(current->positions[REGION_LENGTH])) {
    *p++ = 0;
  }

  return;
}


/* Replaces the contents of current with the trailing part of buffer.

   For each oligo in index order, those positions of buffer that are at
   least shift are written to current with shift subtracted.  After
   each oligo, its offset is set to the number of positions written so
   far.  The unused tail of current->positions is zeroed.

   An offset smaller than its predecessor is taken to be a count of
   REGION_LENGTH that did not fit in a UINT2. */
static void
Region_remainder (Region_T current, Region_T buffer, int shift, size_t offsets_size) {
  /* oligo must be as wide as offsets_size: a UINT2 counter could never
     reach an offsets_size of 65536 or more, so the loop would wrap
     around and run past the end of the positions array. */
  size_t oligo;
  UINT2 *p, position;
  int ptr1, end1, ptr;	/* Have to use int and not UINT2, because values could be 65536 */

  p = &(current->positions[0]);
  for (oligo = 0; oligo < offsets_size; oligo++) {
    /* Range of this oligo in buffer */
    ptr1 = (oligo == 0) ? 0 : buffer->offsets[oligo - 1];
    if ((end1 = buffer->offsets[oligo]) < ptr1) {
      end1 = REGION_LENGTH;
    }

    for (ptr = ptr1; ptr < end1; ptr++) {
      if ((position = buffer->positions[ptr]) >= shift) {
	*p++ = position - shift;
      }
    }

    current->offsets[oligo] = p - &(current->positions[0]);
  }

  while (p < &(current->positions[REGION_LENGTH])) {
    *p++ = 0;
  }

  return;
}


/* Concatenates the region databases named in files[0..nfiles-1] and
   writes the combined database to out.

   A region has offsets_size (4^6 = 4096) offsets, followed by REGION_LENGTH positions

   genomelengths[i] is the genome length covered by files[i].  The
   entries are consumed during processing: each is decremented by
   REGION_LENGTH per region read and finally set to 0, so the caller's
   array is modified.

   One region is read from the current input on each iteration.  It
   covers bufferlen positions, which is REGION_LENGTH or, for the last
   region of a file, whatever remains of that file's genome length.
   The region is merged into an output region that is being filled at
   current_pos:
     Case 1: the output region is still not full; keep accumulating.
     Case 2: the output region is exactly full; write it and start afresh.
     Case 3: the input overflows the output region; fill and write it,
             then start the next output region with the overflow.
   A partially filled output region is written at the end.

   If an input file cannot be opened, a message is printed to stderr
   and the program exits with status 9.  With nfiles <= 0 nothing is
   read or written.

   NOTE(review): Regiondb_write ends its regions REGION_LENGTH oligomer
   start positions apart, which is region1part-1 characters later than
   a multiple of REGION_LENGTH, so a file may hold one region fewer
   than its genome length suggests here.  Confirm how callers compute
   genomelengths. */
void
Regiondb_cat (FILE *out, char **files, Univcoord_T *genomelengths, int nfiles, Width_T region1part) {
  FILE *fp;
  char *file;
  size_t offsets_size = 0;
  unsigned int current_pos = 0, bufferlen;
  Region_T current, buffer, temp;
#ifdef DEBUG1
  int k;
#endif

  if (nfiles <= 0) {
    /* Nothing to concatenate; do not dereference files or genomelengths */
    return;
  }

#ifdef DEBUG1
  printf("nfiles: %d\n",nfiles);
  printf("genomelengths");
  for (k = 0; k < nfiles; k++) {
    printf(" %llu",(unsigned long long) genomelengths[k]);
  }
  printf("\n");
#endif

  offsets_size = power(4,region1part);

  current = Region_new(offsets_size);
  buffer = Region_new(offsets_size);
  temp = Region_new(offsets_size);

  if ((fp = fopen((file = *files++),"rb")) == NULL) {
    fprintf(stderr,"Regiondb file %s is not valid\n",file);
    exit(9);
  } else {
    debug1(printf("*** Opening regiondb file %s\n",file));
  }

  while (fp != NULL) {
#ifdef DEBUG1
    printf("genomelengths");
    for (k = 0; k < nfiles; k++) {
      printf(" %llu",(unsigned long long) genomelengths[k]);
    }
    printf("\n");
#endif

    debug1(printf("Reading buffer\n"));
    /* The result of the read is not checked, so start from an empty
       buffer: a read that stops short (e.g., at end of file) must not
       leave the previous region's contents to be merged a second time. */
    Region_clear(buffer,offsets_size);
    Region_read(buffer,fp,offsets_size);

    if (*genomelengths <= REGION_LENGTH) {
      /* Last region of this file: advance to the next file, if any */
      bufferlen = *genomelengths;
      *genomelengths = 0;

      fclose(fp);
      if (--nfiles == 0) {
	fp = (FILE *) NULL;
      } else if ((fp = fopen((file = *files++),"rb")) == NULL) {
	fprintf(stderr,"Regiondb file %s is not valid\n",file);
	exit(9);
      } else {
	debug1(printf("*** Opening regiondb file %s\n",file));
	genomelengths++;
      }

    } else {
      bufferlen = REGION_LENGTH;
      *genomelengths -= REGION_LENGTH;
    }

    debug1(printf("current_pos %u\n",current_pos));
    if (current_pos + bufferlen < REGION_LENGTH) {
      debug1(printf("Case 1\n"));
      debug1(printf("Appending buffer with bufferlen %u\n",bufferlen));
      Region_append(current,buffer,temp,current_pos,bufferlen,offsets_size);
      current_pos += bufferlen;

    } else if (current_pos + bufferlen == REGION_LENGTH) {
      debug1(printf("Case 2\n"));
      debug1(printf("Appending buffer with bufferlen %u\n",bufferlen));
      Region_append(current,buffer,temp,current_pos,bufferlen,offsets_size);
      debug1(printf("Writing current\n"));
      Region_write(out,current,offsets_size);
      Region_clear(current,offsets_size);
      current_pos = 0;

    } else {
      debug1(printf("Case 3\n"));
      debug1(printf("Appending buffer with truncated bufferlen %u\n",REGION_LENGTH-current_pos));
      Region_append(current,buffer,temp,current_pos,
		    /*bufferlen*/REGION_LENGTH-current_pos,offsets_size);
      debug1(printf("Writing current\n"));
      Region_write(out,current,offsets_size);
      debug1(printf("Saving remainder with shift %u\n",REGION_LENGTH-current_pos));
      Region_remainder(current,buffer,/*shift*/REGION_LENGTH-current_pos,offsets_size);
      current_pos = current_pos + bufferlen - REGION_LENGTH;
    }
    debug1(printf("now current_pos %u\n",current_pos));
  }

  if (current_pos > 0) {
    Region_write(out,current,offsets_size);
  }

  Region_free(&temp);
  Region_free(&buffer);
  Region_free(&current);

  return;
}

