Files
cloud-services/pkg/remotefileupload/csvtoparquet.go

141 lines
3.5 KiB
Go

package remotefileupload
import (
"bufio"
"fmt"
"io"
"strconv"
"strings"
"fiskerinc.com/modules/utils/envtool"
)
// parquetFileSizeIdeal is the writer size threshold at which Write closes the
// current parquet blob and starts a new one. It is read once at package
// initialisation from PARQUET_FILE_SIZE_IN_COMPRESSED; 200 MiB is the value
// passed to envtool.GetEnvInt as its second argument (presumably the fallback
// when the variable is unset - confirm against envtool).
var parquetFileSizeIdeal = int64(envtool.GetEnvInt("PARQUET_FILE_SIZE_IN_COMPRESSED", 1024*1024*200))

// skipAzure makes newWriter return the fake blob writer instead of the Azure
// one when set to true.
var skipAzure = false
// ICSVtoParquet is the interface returned by NewCSVtoParquet. It exposes the
// two halves of a CSV-to-parquet conversion: Read ingests the CSV input and
// Write produces the parquet output.
type ICSVtoParquet interface {
// Read consumes CSV text from the given reader and reports any read error.
Read(io.Reader) error
// Write converts what Read has ingested and reports any write error.
Write() error
}
// csvToParquet is the ICSVtoParquet implementation created by NewCSVtoParquet.
// Read pushes raw CSV lines into queue and Write pulls them out again.
type csvToParquet struct {
// azureAccount and azureAccountKey are handed to NewAzureParquetBlobWriter
// whenever a new blob writer is created.
azureAccount string
azureAccountKey string
// queue carries raw lines from Read to Write; Read closes it when done.
queue chan string
// parquetBlobPath is the path prefix under which generateFile names blobs.
parquetBlobPath string
// counter is the index used for the next generated blob name.
counter int
}
// NewCSVtoParquet returns a converter whose output blobs are named under the
// directory part of parquetBlobUrl (everything before its last "/").
//
// The Azure account name and key are stored for later blob-writer creation.
// The line queue shared by Read and Write is created with room for 20 lines.
func NewCSVtoParquet(azureAccount, azureAccountKey, parquetBlobUrl string) ICSVtoParquet {
	converter := new(csvToParquet)
	converter.azureAccount = azureAccount
	converter.azureAccountKey = azureAccountKey
	converter.parquetBlobPath = getPathFromURL(parquetBlobUrl)
	converter.queue = make(chan string, 20)
	return converter
}
// Read streams the input line by line into cp.queue.
//
// Every line is forwarded exactly as it was read, including its trailing '\n'
// when one is present. A final line that is not newline-terminated is
// forwarded too, so no input is lost at end of file. Sending blocks while the
// queue is full, so something must be receiving from the queue concurrently
// for large inputs.
//
// The queue is closed when Read returns, on success and on failure alike.
//
// Parameters:
//
//	reader (io.Reader): The input stream from which lines are read.
//
// Returns:
//
//	error: The first non-EOF read error, or nil once the input is exhausted.
//	Data read immediately before such an error is not forwarded.
func (cp *csvToParquet) Read(reader io.Reader) error {
	defer close(cp.queue) // Signals the consumer that no more lines will arrive.

	bio := bufio.NewReader(reader)
	for {
		line, err := bio.ReadString('\n')
		if err != nil && err != io.EOF {
			return err
		}
		// ReadString returns whatever it read before hitting EOF, so an
		// unterminated last line arrives together with io.EOF.
		if line != "" {
			cp.queue <- line
		}
		if err == io.EOF {
			return nil
		}
	}
}
// newWriter reserves the next blob name and returns a writer for it.
//
// A name is always taken from generateFile, so the file counter advances even
// when skipAzure is set and the fake writer (which takes no name) is returned.
func (cp *csvToParquet) newWriter() (ParquetBlobWriter, error) {
	blobName := cp.generateFile()
	if skipAzure {
		return NewFakeAzureParquetBlobWriter()
	}
	return NewAzureParquetBlobWriter(blobName, cp.azureAccount, cp.azureAccountKey)
}
// Write drains cp.queue, turns each CSV line into a ParquetCANMessage and
// writes it through a blob writer obtained from newWriter.
//
// A line is expected to look like "<timestamp>,<id>,<data>[,...]": the first
// field is parsed as a base-10 int64, the second as a base-10 int32 and the
// third is stored verbatim as the message data. The trailing line terminator
// ("\n" or "\r\n") is removed first so it never ends up inside the data field.
// Lines with fewer than three fields, or whose timestamp or ID cannot be
// parsed, are skipped rather than written with zero values.
//
// Once the current writer reports a size of at least parquetFileSizeIdeal it
// is closed and a new writer (and therefore a new blob name) is started.
//
// Returns:
//
//	error: The first error from creating a writer or writing a message, or
//	nil once the queue has been closed and fully drained. On an early return
//	the remaining queued lines are left unread.
func (cp *csvToParquet) Write() error {
	writer, err := cp.newWriter()
	if err != nil {
		return err
	}
	// writerOpen is false only between closing a full writer and successfully
	// opening its replacement, so the deferred Close never touches a writer
	// that is already closed or was never created.
	writerOpen := true
	defer func() {
		if writerOpen {
			writer.Close()
		}
	}()

	for line := range cp.queue {
		fields := strings.Split(strings.TrimRight(line, "\r\n"), ",")
		if len(fields) < 3 {
			continue
		}

		timestamp, parseErr := strconv.ParseInt(strings.TrimSpace(fields[0]), 10, 64)
		if parseErr != nil {
			continue // Not a data row (for example a header line).
		}
		id64, parseErr := strconv.ParseInt(strings.TrimSpace(fields[1]), 10, 32)
		if parseErr != nil {
			continue
		}
		id := int32(id64)

		payload := ParquetCANMessage{
			TimestampUSec: &timestamp,
			ID:            &id,
			Data:          &fields[2],
		}
		if err = writer.Write(payload); err != nil {
			return err
		}

		// Roll over to a new file once the size limit is reached, to keep the
		// memory held by a single writer bounded.
		if writer.Size() < parquetFileSizeIdeal {
			continue
		}
		writer.Close()
		writerOpen = false
		writer, err = cp.newWriter()
		if err != nil {
			return err
		}
		writerOpen = true
	}
	return nil
}
// generateFile returns the next blob name, "<parquetBlobPath>/raw-<n>.parquet",
// where n is the current counter value, and then advances the counter so that
// consecutive calls yield raw-0, raw-1, and so on.
func (cp *csvToParquet) generateFile() string {
	index := cp.counter
	cp.counter = index + 1
	return fmt.Sprintf("%v/%v-%d.parquet", cp.parquetBlobPath, "raw", index)
}
// getPathFromURL strips the final "/"-separated element (the file name) from
// file and returns what precedes it, without the trailing slash.
//
// Parameters:
//
//	file (string): A slash-separated path or URL ending in a file name.
//
// Returns:
//
//	string: Everything before the last "/". The result is empty when file
//	contains no "/" at all or when its only "/" is the leading one.
func getPathFromURL(file string) string {
	lastSlash := strings.LastIndex(file, "/")
	if lastSlash < 0 {
		return ""
	}
	return file[:lastSlash]
}