cloud-services/pkg/remotefileupload/csvtoparquet.go

package remotefileupload

import (
	"bufio"
	"fmt"
	"io"
	"strconv"
	"strings"

	"fiskerinc.com/modules/utils/envtool"
)

var (
	// parquetFileSizeIdeal is the threshold compared against the writer's Size()
	// in Write: once the current writer reports at least this many bytes, it is
	// closed and a new parquet file is started. The value comes from the
	// PARQUET_FILE_SIZE_IN_COMPRESSED environment variable via envtool.GetEnvInt;
	// 1024*1024*200 (200 MiB) is passed as its second argument, presumably the
	// fallback when the variable is unset (confirm against envtool).
	parquetFileSizeIdeal = int64(envtool.GetEnvInt("PARQUET_FILE_SIZE_IN_COMPRESSED", 1024*1024*200))
	// skipAzure makes newWriter return the writer from
	// NewFakeAzureParquetBlobWriter instead of the Azure-backed one.
	// NOTE(review): nothing in this file sets it to true; presumably tests do.
	skipAzure            = false
)

// ICSVtoParquet is the interface returned by NewCSVtoParquet. In the
// csvToParquet implementation, Read pushes input lines onto an internal
// channel and Write drains that channel into parquet files.
type ICSVtoParquet interface {
	// Read consumes the given input stream and reports the first read failure.
	Read(io.Reader) error
	// Write persists what Read has queued and reports the first write failure.
	Write() error
}

// csvToParquet is the ICSVtoParquet implementation created by NewCSVtoParquet.
type csvToParquet struct {
	// azureAccount and azureAccountKey are handed to NewAzureParquetBlobWriter
	// each time newWriter opens a new parquet file.
	azureAccount    string
	azureAccountKey string
	// queue carries raw input lines from Read (sender, closes it) to Write (receiver).
	queue           chan string
	// parquetBlobPath is the prefix under which generateFile names the output files.
	parquetBlobPath string
	// counter is the index used in the next generated file name; generateFile
	// increments it on every call.
	counter         int
}

// NewCSVtoParquet builds a converter that stores the given Azure credentials
// and writes its parquet files next to parquetBlobUrl: the output prefix is
// whatever getPathFromURL returns for that URL. The line queue shared by Read
// and Write is created with room for 20 lines.
func NewCSVtoParquet(azureAccount, azureAccountKey, parquetBlobUrl string) ICSVtoParquet {
	converter := new(csvToParquet)
	converter.azureAccount = azureAccount
	converter.azureAccountKey = azureAccountKey
	converter.parquetBlobPath = getPathFromURL(parquetBlobUrl)
	converter.queue = make(chan string, 20)
	return converter
}

// Read reads lines from the provided io.Reader and sends them to the buffered channel 'queue'.
// Lines are read with a bufio.Reader and forwarded exactly as read, i.e. including their
// trailing '\n'. If the input does not end with a newline, the remaining text is still
// forwarded as a final line instead of being dropped.
// The channel is closed when Read returns, whether the input was exhausted or a read failed.
//
// Parameters:
//
//	reader (io.Reader): The input stream from which lines are read.
//
// Returns:
//
//	error: The first read error other than io.EOF; nil once the whole input has been queued.
func (cp *csvToParquet) Read(reader io.Reader) error {
	defer close(cp.queue) // Closing ends the range loop over the queue in Write.

	bio := bufio.NewReader(reader)
	for {
		line, err := bio.ReadString('\n')
		if err != nil && err != io.EOF {
			return err
		}
		// ReadString returns the data read so far together with io.EOF, so an
		// unterminated last line arrives here and must be queued before stopping.
		if line != "" {
			cp.queue <- line
		}
		if err == io.EOF {
			return nil
		}
	}
}

// newWriter opens the writer for the next parquet file. A file name is always
// taken from generateFile, so the file counter advances on every call; the
// name is only used when the Azure-backed writer is created. With skipAzure
// set, the fake writer is returned instead.
func (cp *csvToParquet) newWriter() (ParquetBlobWriter, error) {
	blobName := cp.generateFile()
	if !skipAzure {
		return NewAzureParquetBlobWriter(blobName, cp.azureAccount, cp.azureAccountKey)
	}
	return NewFakeAzureParquetBlobWriter()
}

// Write drains the 'queue' channel and stores every well-formed CSV row as a
// ParquetCANMessage, switching to a new parquet file whenever the current
// writer reports a size of at least parquetFileSizeIdeal.
//
// A row is split on ","; field 0 is parsed as a base-10 int64 timestamp,
// field 1 as a base-10 int32 ID, and field 2 is stored as the data string.
// Any further fields are ignored. The trailing line terminator ("\n" or
// "\r\n") is stripped first so it does not end up in the data field.
// Rows with fewer than three fields, or whose timestamp or ID does not parse,
// are skipped.
//
// NOTE(review): when Write returns early with an error it stops receiving
// from the queue; a Read still sending on the full channel would then block.
//
// Returns:
//
//	error: The first error from newWriter or from the writer's Write; nil once
//	the queue has been closed and fully drained.
func (cp *csvToParquet) Write() error {
	writer, err := cp.newWriter()
	if err != nil {
		return err
	}

	// open tells the deferred cleanup whether writer still needs closing. It is
	// false only between closing a full file and successfully opening the next
	// one, so a failed rotation neither closes the old writer twice nor calls
	// Close on whatever newWriter returned alongside its error.
	open := true
	defer func() {
		if open {
			writer.Close()
		}
	}()

	for line := range cp.queue {
		// Read forwards lines including their terminator.
		fields := strings.Split(strings.TrimRight(line, "\r\n"), ",")
		if len(fields) < 3 {
			continue
		}
		// Header or otherwise malformed rows are skipped rather than written
		// with a zero timestamp/ID.
		timeStamp, parseErr := strconv.ParseInt(fields[0], 10, 64)
		if parseErr != nil {
			continue
		}
		idAs64, parseErr := strconv.ParseInt(fields[1], 10, 32)
		if parseErr != nil {
			continue
		}
		id := int32(idAs64)

		payload := ParquetCANMessage{
			TimestampUSec: &timeStamp,
			ID:            &id,
			Data:          &fields[2],
		}
		if err = writer.Write(payload); err != nil {
			return err
		}

		// Once the file reaches the size limit, continue in a new file to avoid
		// memory issues.
		if writer.Size() >= parquetFileSizeIdeal {
			writer.Close()
			open = false
			if writer, err = cp.newWriter(); err != nil {
				return err
			}
			open = true
		}
	}
	return nil
}

// generateFile returns the name of the next parquet file,
// "<parquetBlobPath>/raw-<counter>.parquet", and advances the counter so that
// consecutive calls yield consecutive indices.
func (cp *csvToParquet) generateFile() string {
	index := cp.counter
	cp.counter = index + 1
	return fmt.Sprintf("%v/%v-%d.parquet", cp.parquetBlobPath, "raw", index)
}

// getPathFromURL strips the last "/"-separated segment (the file name) from
// the given path or URL and returns what precedes it, without the trailing "/".
// Input that contains no "/" at all, including the empty string, yields an
// empty string, as does input whose only "/" is the leading one.
//
// Parameters:
//
//	file (string): The file path or URL whose directory part is wanted.
//
// Returns:
//
//	string: Everything before the last "/" of the input.
func getPathFromURL(file string) string {
	lastSlash := strings.LastIndex(file, "/")
	if lastSlash < 0 {
		// Nothing but a file name: there is no directory part.
		return ""
	}
	return file[:lastSlash]
}