package remotefileupload

import (
	"bufio"
	"fmt"
	"io"
	"strconv"
	"strings"

	"fiskerinc.com/modules/utils/envtool"
)

var (
	// parquetFileSizeIdeal is the size threshold, compared against
	// ParquetBlobWriter.Size, at which Write rolls over to a new parquet file.
	// It is taken from PARQUET_FILE_SIZE_IN_COMPRESSED, with 1024*1024*200
	// (200 MiB) passed to envtool.GetEnvInt as the fallback value.
	parquetFileSizeIdeal = int64(envtool.GetEnvInt("PARQUET_FILE_SIZE_IN_COMPRESSED", 1024*1024*200))

	// skipAzure makes newWriter return the fake blob writer instead of the
	// Azure-backed one.
	// NOTE(review): presumably flipped by tests only — confirm.
	skipAzure = false
)

// ICSVtoParquet converts CSV lines into parquet files. Read feeds lines into
// an internal queue and Write consumes them.
type ICSVtoParquet interface {
	Read(io.Reader) error
	Write() error
}

// csvToParquet is the ICSVtoParquet implementation that connects Read and
// Write through a buffered channel.
type csvToParquet struct {
	azureAccount    string
	azureAccountKey string
	queue           chan string // raw CSV lines handed from Read to Write
	parquetBlobPath string      // directory part of the blob URL given to the constructor
	counter         int         // index used in the next generated parquet file name
}

// NewCSVtoParquet builds a converter whose parquet files are placed next to
// parquetBlobUrl (its last path segment is stripped by getPathFromURL).
func NewCSVtoParquet(azureAccount, azureAccountKey, parquetBlobUrl string) ICSVtoParquet {
	return &csvToParquet{
		azureAccount:    azureAccount,
		azureAccountKey: azureAccountKey,
		queue:           make(chan string, 20),
		parquetBlobPath: getPathFromURL(parquetBlobUrl),
	}
}

// Read reads lines from the provided io.Reader and sends them to a buffered channel.
// The function uses a bufio.Reader to read lines until it encounters an EOF (end of file).
// Each read line is sent to the pre-initialized buffered channel 'queue' for further processing.
// A final line that is not terminated by a newline is sent as well.
// The channel is closed once all lines are read or if an error occurs during the process.
//
// Parameters:
//
//	reader (io.Reader): The input stream from which lines are read.
//
// Returns:
//
//	error: If an error occurs during reading, it is returned. Otherwise, returns nil.
func (cp *csvToParquet) Read(reader io.Reader) error {
	defer close(cp.queue) // Close the channel when the input is done.

	bio := bufio.NewReader(reader)
	for {
		line, err := bio.ReadString('\n')
		if err != nil && err != io.EOF {
			return err
		}
		// ReadString returns the data read so far together with io.EOF, so a
		// last line without a trailing newline must still be forwarded.
		if line != "" {
			cp.queue <- line
		}
		if err == io.EOF {
			return nil
		}
	}
}

// newWriter opens the writer for the next parquet file. generateFile is called
// in both branches, so the file counter advances even when the fake writer is
// used.
func (cp *csvToParquet) newWriter() (ParquetBlobWriter, error) {
	if skipAzure {
		cp.generateFile()
		return NewFakeAzureParquetBlobWriter()
	}

	return NewAzureParquetBlobWriter(cp.generateFile(), cp.azureAccount, cp.azureAccountKey)
}

// Write drains the queue filled by Read, converts each CSV line into a
// ParquetCANMessage and writes it to the current parquet file. A new file is
// started whenever the writer reports a size of at least parquetFileSizeIdeal.
// Lines with fewer than three comma-separated fields are skipped.
//
// Returns:
//
//	error: The first error from opening a writer or writing a row, otherwise nil.
//
// NOTE(review): when Write returns early on an error, nothing receives from
// the queue any more, so a concurrent Read can block once the buffer is full —
// confirm the caller accounts for this.
func (cp *csvToParquet) Write() error {
	writer, err := cp.newWriter()
	if err != nil {
		return err
	}
	// The closure reads writer when it runs, so it closes whichever writer is
	// current at return time, not just the first one.
	defer func() {
		writer.Close()
	}()

	for line := range cp.queue {
		// Lines arrive with their delimiter attached; strip it so "\n" / "\r\n"
		// does not end up inside the last field.
		fields := strings.Split(strings.TrimRight(line, "\r\n"), ",")
		if len(fields) < 3 {
			continue
		}

		// Parse errors are ignored, so an unparsable field is written as 0.
		// NOTE(review): confirm this best-effort behaviour is intended.
		timeStamp, _ := strconv.ParseInt(fields[0], 10, 64)
		idAs64, _ := strconv.ParseInt(fields[1], 10, 32)
		id := int32(idAs64)

		payload := ParquetCANMessage{
			TimestampUSec: &timeStamp,
			ID:            &id,
			Data:          &fields[2],
		}
		if err = writer.Write(payload); err != nil {
			return err
		}

		// Once the current file reaches the configured size (200 MiB by
		// default), continue in a new file to avoid memory issues.
		if writer.Size() >= parquetFileSizeIdeal {
			writer.Close()
			writer, err = cp.newWriter()
			if err != nil {
				return err
			}
		}
	}

	return nil
}

// generateFile returns the next parquet file path, of the form
// "<parquetBlobPath>/raw-<counter>.parquet", and increments the counter.
func (cp *csvToParquet) generateFile() string {
	file := fmt.Sprintf("%v/%v-%d.parquet", cp.parquetBlobPath, "raw", cp.counter)
	cp.counter++
	return file
}

// getPathFromURL takes a file path as input and returns the path without the file name.
// It splits the input path using "/" as the separator, removes the last element (file name),
// and then joins the remaining elements back together with "/" as the separator.
// If the input is an empty string or contains only the root directory, the function returns an empty string.
//
// Parameters:
//
//	file (string): The input file path from which to extract the directory path.
//
// Returns:
//
//	string: The directory path without the file name.
func getPathFromURL(file string) string {
	// Everything before the final "/" is the directory part; an input without
	// any "/" has no directory component, which yields the empty string.
	lastSlash := strings.LastIndex(file, "/")
	if lastSlash < 0 {
		return ""
	}
	return file[:lastSlash]
}