141 lines
3.5 KiB
Go
141 lines
3.5 KiB
Go
package remotefileupload
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"io"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"fiskerinc.com/modules/utils/envtool"
|
|
)
|
|
|
|
var (
|
|
parquetFileSizeIdeal = int64(envtool.GetEnvInt("PARQUET_FILE_SIZE_IN_COMPRESSED", 1024*1024*200))
|
|
skipAzure = false
|
|
)
|
|
|
|
// ICSVtoParquet is the two-stage CSV-to-parquet conversion contract returned
// by NewCSVtoParquet.
type ICSVtoParquet interface {
	// Read consumes CSV text from r. In the csvToParquet implementation the
	// lines are pushed onto an internal queue for Write to pick up.
	Read(r io.Reader) error

	// Write produces the parquet output. In the csvToParquet implementation
	// it drains the queue filled by Read until that queue is closed.
	Write() error
}
|
|
|
|
// csvToParquet is the ICSVtoParquet implementation in this file: Read feeds
// raw CSV lines into queue and Write turns them into parquet blobs.
type csvToParquet struct {
	// Credentials forwarded to NewAzureParquetBlobWriter by newWriter.
	azureAccount, azureAccountKey string

	// queue carries raw lines from Read to Write; Read closes it when done.
	queue chan string

	// parquetBlobPath is the prefix under which generateFile names the
	// output files.
	parquetBlobPath string

	// counter is the index used for the next generated file name.
	counter int
}
|
|
|
|
func NewCSVtoParquet(azureAccount, azureAccountKey, parquetBlobUrl string) ICSVtoParquet {
|
|
return &csvToParquet{
|
|
azureAccount: azureAccount,
|
|
azureAccountKey: azureAccountKey,
|
|
queue: make(chan string, 20),
|
|
parquetBlobPath: getPathFromURL(parquetBlobUrl),
|
|
}
|
|
}
|
|
|
|
// Read reads lines from the provided io.Reader and sends them to a buffered channel.
|
|
// The function uses a bufio.Reader to efficiently read lines until it encounters an EOF (end of file).
|
|
// Each read line is sent to a pre-initialized buffered channel 'queue' for further processing.
|
|
// The channel is closed once all lines are read or if an error occurs during the process.
|
|
//
|
|
// Parameters:
|
|
//
|
|
// reader (io.Reader): The input stream from which lines are read.
|
|
//
|
|
// Returns:
|
|
//
|
|
// error: If an error occurs during reading, it is returned. Otherwise, returns nil.
|
|
func (cp *csvToParquet) Read(reader io.Reader) error {
|
|
defer close(cp.queue) // Close the channel when file done.
|
|
bio := bufio.NewReader(reader)
|
|
for {
|
|
line, err := bio.ReadString('\n')
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
cp.queue <- line
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (cp *csvToParquet) newWriter() (ParquetBlobWriter, error) {
|
|
if skipAzure {
|
|
cp.generateFile()
|
|
return NewFakeAzureParquetBlobWriter()
|
|
}
|
|
return NewAzureParquetBlobWriter(cp.generateFile(), cp.azureAccount, cp.azureAccountKey)
|
|
}
|
|
|
|
func (cp *csvToParquet) Write() error {
|
|
var writer ParquetBlobWriter
|
|
var err error
|
|
writer, err = cp.newWriter()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
defer func() {
|
|
writer.Close()
|
|
}()
|
|
|
|
for line := range cp.queue {
|
|
splitedRaw := strings.Split(line, ",")
|
|
if len(splitedRaw) < 3 {
|
|
continue
|
|
}
|
|
timeStamp, _ := strconv.ParseInt(splitedRaw[0], 10, 64)
|
|
idAs64, _ := strconv.ParseInt(splitedRaw[1], 10, 32)
|
|
id := int32(idAs64)
|
|
|
|
payload := ParquetCANMessage{
|
|
TimestampUSec: &timeStamp,
|
|
ID: &id,
|
|
Data: &splitedRaw[2],
|
|
}
|
|
err = writer.Write(payload)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// if size is greater to 200MB, start writing in new file to avoid memory issue
|
|
if writer.Size() >= parquetFileSizeIdeal {
|
|
writer.Close()
|
|
writer, err = cp.newWriter()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (cp *csvToParquet) generateFile() string {
|
|
file := fmt.Sprintf("%v/%v-%d.parquet", cp.parquetBlobPath, "raw", cp.counter)
|
|
cp.counter++
|
|
return file
|
|
}
|
|
|
|
// getPathFromURL strips the final "/"-separated segment (the file name) from
// file and returns what precedes it, without the trailing "/".
//
// Inputs that contain no "/" at all, including the empty string, yield the
// empty string, as does a lone "/".
//
// Parameters:
//
//	file (string): The slash-separated path or URL to shorten.
//
// Returns:
//
//	string: Everything before the last "/" in file.
func getPathFromURL(file string) string {
	lastSlash := strings.LastIndex(file, "/")
	if lastSlash < 0 {
		return ""
	}
	return file[:lastSlash]
}
|