Initial cloud-services repo - gateway service + pkg modules
This commit is contained in:
140
pkg/remotefileupload/csvtoparquet.go
Normal file
140
pkg/remotefileupload/csvtoparquet.go
Normal file
@@ -0,0 +1,140 @@
|
||||
package remotefileupload
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"fiskerinc.com/modules/utils/envtool"
|
||||
)
|
||||
|
||||
var (
|
||||
parquetFileSizeIdeal = int64(envtool.GetEnvInt("PARQUET_FILE_SIZE_IN_COMPRESSED", 1024*1024*200))
|
||||
skipAzure = false
|
||||
)
|
||||
|
||||
// ICSVtoParquet is the two-step contract of a CSV-to-parquet converter:
// one side feeds CSV input in, the other side writes the converted output.
type ICSVtoParquet interface {
	// Read consumes CSV input from reader and reports the first error hit
	// while doing so.
	Read(reader io.Reader) error

	// Write emits the data collected through Read and reports the first
	// error hit while doing so.
	Write() error
}
|
||||
|
||||
// csvToParquet is the ICSVtoParquet implementation returned by
// NewCSVtoParquet. Read pushes raw CSV lines into queue and Write turns them
// into parquet rows.
type csvToParquet struct {
	// Azure storage credentials, forwarded to NewAzureParquetBlobWriter.
	azureAccount, azureAccountKey string

	// queue carries raw CSV lines from Read to Write. Read closes it when it
	// returns.
	queue chan string

	// parquetBlobPath is the prefix placed in front of every generated
	// parquet file name.
	parquetBlobPath string

	// counter is the index used in the next generated file name; it is
	// advanced by generateFile.
	counter int
}
|
||||
|
||||
func NewCSVtoParquet(azureAccount, azureAccountKey, parquetBlobUrl string) ICSVtoParquet {
|
||||
return &csvToParquet{
|
||||
azureAccount: azureAccount,
|
||||
azureAccountKey: azureAccountKey,
|
||||
queue: make(chan string, 20),
|
||||
parquetBlobPath: getPathFromURL(parquetBlobUrl),
|
||||
}
|
||||
}
|
||||
|
||||
// Read reads lines from the provided io.Reader and sends them to a buffered channel.
|
||||
// The function uses a bufio.Reader to efficiently read lines until it encounters an EOF (end of file).
|
||||
// Each read line is sent to a pre-initialized buffered channel 'queue' for further processing.
|
||||
// The channel is closed once all lines are read or if an error occurs during the process.
|
||||
//
|
||||
// Parameters:
|
||||
//
|
||||
// reader (io.Reader): The input stream from which lines are read.
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// error: If an error occurs during reading, it is returned. Otherwise, returns nil.
|
||||
func (cp *csvToParquet) Read(reader io.Reader) error {
|
||||
defer close(cp.queue) // Close the channel when file done.
|
||||
bio := bufio.NewReader(reader)
|
||||
for {
|
||||
line, err := bio.ReadString('\n')
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cp.queue <- line
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cp *csvToParquet) newWriter() (ParquetBlobWriter, error) {
|
||||
if skipAzure {
|
||||
cp.generateFile()
|
||||
return NewFakeAzureParquetBlobWriter()
|
||||
}
|
||||
return NewAzureParquetBlobWriter(cp.generateFile(), cp.azureAccount, cp.azureAccountKey)
|
||||
}
|
||||
|
||||
func (cp *csvToParquet) Write() error {
|
||||
var writer ParquetBlobWriter
|
||||
var err error
|
||||
writer, err = cp.newWriter()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
writer.Close()
|
||||
}()
|
||||
|
||||
for line := range cp.queue {
|
||||
splitedRaw := strings.Split(line, ",")
|
||||
if len(splitedRaw) < 3 {
|
||||
continue
|
||||
}
|
||||
timeStamp, _ := strconv.ParseInt(splitedRaw[0], 10, 64)
|
||||
idAs64, _ := strconv.ParseInt(splitedRaw[1], 10, 32)
|
||||
id := int32(idAs64)
|
||||
|
||||
payload := ParquetCANMessage{
|
||||
TimestampUSec: &timeStamp,
|
||||
ID: &id,
|
||||
Data: &splitedRaw[2],
|
||||
}
|
||||
err = writer.Write(payload)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// if size is greater to 200MB, start writing in new file to avoid memory issue
|
||||
if writer.Size() >= parquetFileSizeIdeal {
|
||||
writer.Close()
|
||||
writer, err = cp.newWriter()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cp *csvToParquet) generateFile() string {
|
||||
file := fmt.Sprintf("%v/%v-%d.parquet", cp.parquetBlobPath, "raw", cp.counter)
|
||||
cp.counter++
|
||||
return file
|
||||
}
|
||||
|
||||
// getPathFromURL strips the last "/"-separated segment (normally the file
// name) from file and returns what precedes it, without the trailing "/".
//
// The input is treated as plain text: nothing is URL-parsed, so a scheme and
// host, if present, are kept as part of the result. An input without any "/"
// (including the empty string), or one whose only "/" is its first
// character, yields "".
//
// Parameters:
//
//	file (string): The path or URL whose last segment is removed.
//
// Returns:
//
//	string: Everything before the last "/" of file.
func getPathFromURL(file string) string {
	cut := strings.LastIndex(file, "/")
	if cut < 0 {
		return ""
	}
	return file[:cut]
}
|
||||
Reference in New Issue
Block a user