Initial cloud-services repo - gateway service + pkg modules

This commit is contained in:
Chris Rai
2026-01-30 23:14:52 -05:00
commit fbb820d7b3
1037 changed files with 171318 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
Contains code from cloud/cargo/handlers/events.go
Needed for cloud/valet/handlers/log_trex.go
So code doesn't need to be in two different places

View File

@@ -0,0 +1,76 @@
package remotefileupload
import (
"bytes"
"fmt"
"net/http"
"path/filepath"
"fiskerinc.com/modules/logger"
"fiskerinc.com/modules/utils/envtool"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/s3"
)
// S3 settings read through envtool.GetEnv at package initialisation.
// The second argument to GetEnv is presumably the fallback used when the
// environment variable is unset — confirm against envtool.
var (
// awsBucketRegion is the region passed to the S3 client config (AWS_REGION).
awsBucketRegion = envtool.GetEnv("AWS_REGION", "us-west-2")
// awsBucketName is read from AWS_BUCKET_NAME. NewS3Uploader declares a parameter
// with the same name, which shadows this variable inside that function.
awsBucketName = envtool.GetEnv("AWS_BUCKET_NAME", "fisker-data-test")
// awsFileExtension is the suffix appended to the fixed "raw" object name (AWS_FILE_EXTENSION).
awsFileExtension = envtool.GetEnv("AWS_FILE_EXTENSION", ".csv")
)
// NewS3Uploader returns an Uploader backed by S3 for the given bucket.
//
// The region and file extension come from the package-level awsBucketRegion
// and awsFileExtension. The AWS session is built with session.Must, so a
// failure to create the session panics instead of returning an error.
//
// Per the original note, the AWS SDK expects these ENV vars to be set:
// AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN.
func NewS3Uploader(awsBucketName string) Uploader {
	sess := session.Must(session.NewSession())
	client := s3.New(sess, aws.NewConfig().WithRegion(awsBucketRegion))

	return &S3Uploader{
		service:       client,
		bucketRegion:  awsBucketRegion,
		bucketName:    awsBucketName,
		fileExtension: awsFileExtension,
	}
}
// S3Uploader holds the S3 client and the object-naming settings used by Upload.
type S3Uploader struct {
// service is the S3 client used to issue PutObject calls.
service *s3.S3
// bucketRegion is the region the client was configured with.
bucketRegion string
// bucketName is the destination bucket for uploaded objects.
bucketName string
// fileExtension is appended to the fixed "raw" object name.
fileExtension string
}
// Upload stores block as a single S3 object and returns the object key.
//
// The key is built by s3ObjectURL from filePath, and the content type is
// sniffed from block with http.DetectContentType. logValue supplies the
// key/value pair attached to the debug log lines.
//
// The original note says objects can be up to 15MB before requiring
// multi-upload; that limit is not enforced here.
func (s *S3Uploader) Upload(block []byte, logValue LogPayload, filePath ...string) (string, error) {
	contentType := http.DetectContentType(block)
	key := s.s3ObjectURL(filePath)

	request := &s3.PutObjectInput{
		Bucket:      aws.String(s.bucketName),
		Key:         aws.String(key),
		ContentType: aws.String(contentType),
		Body:        bytes.NewReader(block),
	}

	logger.Debug().Str(logValue.Title, logValue.Value).Msgf("sending block of length %d to aws object: %s", len(block), key)
	if _, err := s.service.PutObject(request); err != nil {
		return "", err
	}

	logger.Debug().Str(logValue.Title, logValue.Value).Msgf("upload complete")
	return key, nil
}
// s3ObjectURL builds the object key for an upload: the filePath segments
// joined together, followed by the fixed file name "raw" plus the uploader's
// file extension.
func (s *S3Uploader) s3ObjectURL(filePath []string) string {
	dir := filepath.Join(filePath...)
	return filepath.Join(dir, fmt.Sprintf("%s%s", "raw", s.fileExtension))
}

View File

@@ -0,0 +1,30 @@
package remotefileupload_test
import (
"testing"
"fiskerinc.com/modules/remotefileupload"
"fiskerinc.com/modules/utils/envtool"
"fiskerinc.com/modules/testhelper"
)
// TestNewAWSUploaderIntegration constructs an S3 uploader for the configured
// bucket. It is skipped unconditionally.
func TestNewAWSUploaderIntegration(t *testing.T) {
	t.Skip()

	bucket := envtool.GetEnv("AWS_BUCKET_NAME", "fisker-data-test")
	_ = remotefileupload.NewS3Uploader(bucket)
}
// TestAWSUploadIntegration uploads a small block through the S3 uploader and
// fails if Upload returns an error. It is skipped unconditionally.
func TestAWSUploadIntegration(t *testing.T) {
	t.Skip()

	awsBucketName := envtool.GetEnv("AWS_BUCKET_NAME", "fisker-data-test")
	s := remotefileupload.NewS3Uploader(awsBucketName)

	_, err := s.Upload([]byte("testblock"), remotefileupload.LogPayload{Title: "vin", Value: "TESTVIN123"}, "TESTVIN123", "TESTVERSION123")
	if err != nil {
		// Report under this test's own name; the message previously named the Azure test.
		t.Errorf(testhelper.TestErrorTemplate, "TestAWSUploadIntegration", "error", err)
		return
	}
}

View File

@@ -0,0 +1,123 @@
package remotefileupload
import (
"bytes"
"context"
"fmt"
"io"
"net/url"
"fiskerinc.com/modules/logger"
"fiskerinc.com/modules/utils/envtool"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/appendblob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/bloberror"
"github.com/pkg/errors"
)
// Azure storage account settings read through envtool.GetEnv at package
// initialisation. "REPLACE_ME" is presumably the fallback used when the
// variable is unset — confirm against envtool.
var (
// azureAccount is the storage account name used for credentials and blob URLs.
azureAccount = envtool.GetEnv("AZURE_STORAGE_ACCOUNT", "REPLACE_ME")
// azureAccountKey is the shared key paired with azureAccount.
azureAccountKey = envtool.GetEnv("AZURE_STORAGE_ACCESS_KEY", "REPLACE_ME")
)
// NewAzureUploader builds an AzureUploader for the given container and file
// extension, using the package-level azureAccount and azureAccountKey.
//
// If the shared key credential cannot be created, the partially initialised
// uploader (no container path, no credentials) is returned together with the
// stack-wrapped error.
func NewAzureUploader(azureStorageContainerName string, azureFileExtension string) (Uploader, error) {
	uploader := &AzureUploader{
		accountName:   azureAccount,
		containerName: azureStorageContainerName,
		fileExtension: azureFileExtension,
	}

	credential, err := azblob.NewSharedKeyCredential(uploader.accountName, azureAccountKey)
	if err != nil {
		return uploader, errors.WithStack(err)
	}

	uploader.containerPath = fmt.Sprintf("https://%s.blob.core.windows.net/%s/", uploader.accountName, uploader.containerName)
	uploader.azureCredentials = credential
	return uploader, nil
}
// AzureUploader stores file location and creds to perform AppendBlock operation to blobs
type AzureUploader struct {
// accountName is the storage account used in the container URL and credential.
accountName string
// containerName is the container that blobs are written to.
containerName string
// fileExtension is appended to the fixed "raw" blob name.
fileExtension string
// containerPath is the container URL, https://<account>.blob.core.windows.net/<container>/.
containerPath string
// azureCredentials is the shared key credential used to build append-blob clients.
azureCredentials *azblob.SharedKeyCredential
}
// Upload appends block to the end of the append blob addressed by filePath.
// If the blob does not exist yet, it is created first and then appended to.
//
// Parameters:
//   - block: the bytes to append.
//   - logValue: key/value pair attached to every log line for this call.
//   - filePath: path segments under the container; "raw" plus the uploader's
//     file extension is appended as the blob name.
//
// Returns the blob URL on success. Any failure (client creation, property
// lookup, blob creation or append) is returned to the caller; previously the
// last three were only logged and the call reported success.
func (a *AzureUploader) Upload(block []byte, logValue LogPayload, filePath ...string) (string, error) {
	ctx := context.Background()
	blobURL := a.azureBlobURL(a.containerPath, filePath)

	client, err := appendblob.NewClientWithSharedKeyCredential(blobURL, a.azureCredentials, nil)
	if err != nil {
		return "", err
	}

	logger.Debug().Str(logValue.Title, logValue.Value).Msgf("sending block of length %d to azure container: %s", len(block), blobURL)

	// Instead of sending data and then finding out whether the blob exists,
	// check for existence up front and create the blob only when it is missing.
	if _, err := client.GetProperties(ctx, nil); err != nil {
		if !bloberror.HasCode(err, bloberror.BlobNotFound) {
			logger.Error().Str(logValue.Title, logValue.Value).Err(err).Send()
			return "", err
		}
		if _, err := client.Create(ctx, nil); err != nil {
			logger.Error().Str(logValue.Title, logValue.Value).Err(err).Send()
			return "", err
		}
	}

	reader := NopCloser(bytes.NewReader(block))
	if _, err := client.AppendBlock(ctx, reader, nil); err != nil {
		logger.Error().Str(logValue.Title, logValue.Value).Err(err).Send()
		return "", err
	}

	logger.Debug().Str(logValue.Title, logValue.Value).Msgf("upload complete")
	return blobURL, nil
}
// azureBlobURL returns the full blob URL for an upload.
//
// basePath is the container URL (https://<account>.blob.core.windows.net/<container>/).
// The filePath segments are joined onto it, followed by the blob name "raw"
// plus the uploader's file extension. Errors from url.JoinPath are ignored.
func (a *AzureUploader) azureBlobURL(basePath string, filePath []string) string {
	dirURL, _ := url.JoinPath(basePath, filePath...)
	blobURL, _ := url.JoinPath(dirURL, fmt.Sprintf("%s%s", "raw", a.fileExtension))
	return blobURL
}
// nopCloser adds a no-op Close method to an io.ReadSeeker so that it
// satisfies io.ReadSeekCloser.
type nopCloser struct {
	io.ReadSeeker
}

// Close does nothing and always returns nil.
func (nopCloser) Close() error { return nil }

// NopCloser wraps rs in an io.ReadSeekCloser whose Close method is a no-op.
// Read and Seek are forwarded to rs.
func NopCloser(rs io.ReadSeeker) io.ReadSeekCloser {
	return nopCloser{ReadSeeker: rs}
}
// AzureFilePathLink builds the https URL of a blob.
//
// AzureAccount is the whole storage account name, such as fiskercloudstg.
// AzureContainerName is the name of the specific container, such as trexlogs.
// PathPieces is the path to the file inside the container, e.g.
// "someVIN", "2023", "05", "03", "raw.log".
//
// The pieces are joined with url.JoinPath, and its error is returned unchanged.
func AzureFilePathLink(AzureAccount, AzureContainerName string, PathPieces ...string) (link string, err error) {
	containerURL := fmt.Sprintf("https://%s.blob.core.windows.net/%s", AzureAccount, AzureContainerName)
	link, err = url.JoinPath(containerURL, PathPieces...)
	return link, err
}

View File

@@ -0,0 +1,63 @@
package remotefileupload
import (
"math/rand"
"strconv"
"testing"
"time"
)
// BenchmarkAzureUpload runs the timed upload loop against the direct
// (non-batching) Azure uploader. It overrides the package-level account
// settings, so a real key must be substituted for "REPLACE_ME" to run it.
//
// Original note: without the goroutine, only sends 11.
func BenchmarkAzureUpload(b *testing.B) {
	azureAccount = "fiskerclouddev"
	azureAccountKey = "REPLACE_ME"

	uploader, err := NewAzureUploader("trex-logs", ".txt")
	if err != nil {
		// Stop here: on error the returned uploader has no credentials,
		// so running the upload loop against it is meaningless.
		b.Fatal(err)
	}
	benchmarkUploadTime(uploader, b)
}
// BenchmarkAzureBatchUpload runs the timed upload loop against the batching
// Azure uploader with a 1-second batch interval and "\n" as separator. It
// overrides the package-level account settings, so a real key must be
// substituted for "REPLACE_ME" to run it.
func BenchmarkAzureBatchUpload(b *testing.B) {
	azureAccount = "fiskerclouddev"
	azureAccountKey = "REPLACE_ME"

	uploader, err := NewAzureBatchUploader("trex-logs", ".txt", 1, "\n")
	if err != nil {
		// Stop here: on error the returned uploader is not fully initialised.
		b.Fatal(err)
	}
	benchmarkUploadTime(uploader, b)
}
// benchmarkUploadTime calls SendMessage in a tight loop for five seconds,
// sleeps another five seconds, and then logs how many messages were sent.
//
// Original note recorded for one run: 1762456 messages.
func benchmarkUploadTime(uploader Uploader, b *testing.B) {
	deadline := time.NewTimer(time.Second * 5)
	sent := 0

	for running := true; running; {
		select {
		case <-deadline.C:
			running = false
		default:
			SendMessage(time.Now(), uploader, b)
			sent++
		}
	}

	time.Sleep(5 * time.Second)
	b.Logf("Benchmark %s 'sent' %d messages\n", b.Name(), sent)
}
// SendMessage uploads the timestamp t (formatted as time.RubyDate) through
// uploader, under "/benchmark/<n>" where n is a random integer in [0, 10).
// An upload error is reported on b.
func SendMessage(t time.Time, uploader Uploader, b *testing.B) {
	thread := rand.Intn(10)
	payload := LogPayload{
		Title: "This",
		Value: "Some",
	}

	_, err := uploader.Upload([]byte(t.Format(time.RubyDate)), payload, "/benchmark", strconv.Itoa(thread))
	if err != nil {
		// Report the upload error itself; this previously passed b.
		b.Error(err)
	}
}

View File

@@ -0,0 +1,87 @@
package remotefileupload_test
import (
"fmt"
"testing"
"time"
"fiskerinc.com/modules/remotefileupload"
"fiskerinc.com/modules/utils/envtool"
"fiskerinc.com/modules/testhelper"
)
// Original note: in order to run the integration tests on the uploader, the
// goroutine that makes the upload needs to not be a goroutine.

// TestNewAzureUploaderIntegration constructs an Azure uploader for the
// configured container and fails if construction returns an error. It is
// skipped unconditionally.
func TestNewAzureUploaderIntegration(t *testing.T) {
	t.Skip()

	container := envtool.GetEnv("AZURE_TREX_LOGS_STORAGE_CONTAINER_NAME", "raw-can")
	if _, err := remotefileupload.NewAzureUploader(container, ".csv"); err != nil {
		t.Errorf(testhelper.TestErrorTemplate, "TestNewAzureUploaderIntegration", nil, err)
	}
}
// TestAzureUploadIntegration uploads a small JSON-like payload under
// TESTVIN123/<yyyy>/<mm>/<dd> and fails if construction or upload returns an
// error. It is skipped unconditionally.
func TestAzureUploadIntegration(t *testing.T) {
	t.Skip()

	azureContainerName := envtool.GetEnv("AZURE_TREX_LOGS_STORAGE_CONTAINER_NAME", "raw-can")
	a, err := remotefileupload.NewAzureUploader(azureContainerName, ".csv")
	if err != nil {
		t.Error(err)
		return
	}

	// Read the clock once so year, month and day always belong to the same
	// instant, even when the test runs across midnight.
	now := time.Now()
	date := fmt.Sprintf("%04d/%02d/%02d", now.Year(), now.Month(), now.Day())

	_, err = a.Upload([]byte("{'id':'testJson'}"), remotefileupload.LogPayload{Title: "vin", Value: "TESTVIN123"}, "TESTVIN123", date)
	if err != nil {
		t.Error(err)
		return
	}
}
// TestAzureUploadTestAppend uploads three payloads to the same blob path in
// sequence, stopping at the first error, and then sleeps for a minute. It is
// skipped unconditionally.
func TestAzureUploadTestAppend(t *testing.T) {
	t.Skip()

	a, err := remotefileupload.NewAzureUploader("trex-logs", ".csv")
	if err != nil {
		t.Fatal(err)
	}

	logValue := remotefileupload.LogPayload{Title: "vin", Value: "path"}
	for _, payload := range []string{"Hello", "goodbye", "again"} {
		if _, err = a.Upload([]byte(payload), logValue, "path"); err != nil {
			t.Fatal(err)
		}
	}

	time.Sleep(time.Minute)
}
// TestURLGeneration checks that AzureFilePathLink produces the same URL
// whether the path pieces are given one segment at a time or with a piece
// that carries its own slashes.
func TestURLGeneration(t *testing.T) {
	const want = "https://dev-account.blob.core.windows.net/trex-logs/trex/12345678/2022/log.txt"

	pieceSets := [][]string{
		{"trex", "12345678", "2022", "log.txt"},
		{"trex", "/12345678/2022", "log.txt"},
	}

	for _, pieces := range pieceSets {
		link, err := remotefileupload.AzureFilePathLink("dev-account", "trex-logs", pieces...)
		if err != nil {
			t.Error(err)
		}
		if link != want {
			t.Errorf("Link did not match: %s", link)
		}
	}
}

View File

@@ -0,0 +1,318 @@
package remotefileupload
import (
"context"
"fmt"
"net/url"
"strings"
"sync"
"time"
"fiskerinc.com/modules/logger"
"fiskerinc.com/modules/utils/elptr"
"fiskerinc.com/modules/utils/envtool"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/appendblob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blockblob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/sas"
)
// Backup settings read through envtool at package initialisation.
var (
// backupContainerName is the container Move copies blobs into and ToParquet reads from.
backupContainerName = envtool.GetEnv("AZURE_STORAGE_BACKUP_CONTAINER", "raw-can-archive")
// ttl is multiplied by time.Hour in setTTL, so the 60*24 value is 1440 hours.
ttl = envtool.GetEnvInt64("AZURE_STORAGE_BACKUP_TTL", 60*24) // 60 days
// azureRawCompressedContainerName is the container used for the parquet destination URL in ToParquet.
azureRawCompressedContainerName = envtool.GetEnv("AZURE_STORAGE_RAW_COMPRESSED_CONTAINER", "raw-can-compressed")
)
// Backup holds the storage account details used to move blobs into the backup
// container and to convert them to parquet.
type Backup struct {
// azureAccount is the storage account name used in blob URLs and SAS signing.
azureAccount string
// azureAccountKey is the shared key for azureAccount.
azureAccountKey string
// containerName is the source container that Move and remove operate on.
containerName string
// cred is the shared key credential built in NewBackup; it is nil if that construction failed.
cred *azblob.SharedKeyCredential
}
// Log message templates used by the backup code. Those containing %s are
// passed through fmt.Sprintf with a blob path or URL before logging.
var (
errTTL = "Failed to set ttl %s"
errCopy = "Failed to copy %s"
errDelete = "Failed to delete %s"
errClient = "Failed to create client"
errParquetWriter = "Failed to create parquet wirter %s"
errDownload = "Failed to download file %s"
// blobNotExists is matched against error text with strings.Contains to
// recognise a missing blob.
blobNotExists = "The specified blob does not exist."
)
// NewBackup returns a Backup for the given storage account and source
// container. The shared key credential is created here; an error from its
// creation is discarded, leaving whatever credential value was returned.
func NewBackup(azureAccount string, azureAccountKey string, containerName string) *Backup {
	cred, _ := azblob.NewSharedKeyCredential(azureAccount, azureAccountKey)

	return &Backup{
		azureAccount:    azureAccount,
		azureAccountKey: azureAccountKey,
		containerName:   containerName,
		cred:            cred,
	}
}
// remove deletes an append blob from the Backup's source container.
//
// Parameters:
//   - ctx: context for the delete request; cancellation and deadlines on it
//     are honoured.
//   - filePath: path of the blob inside the source container.
//
// Returns:
//   - err: the client-creation or delete error, or nil on success. Nothing is
//     logged here; callers decide how to report the failure.
func (b *Backup) remove(ctx context.Context, filePath string) error {
	// Construct the full URL of the blob in Azure Blob Storage.
	fullPath := b.azureBlobURL(b.getContainerPath(b.containerName), filePath)

	client, err := appendblob.NewClientWithSharedKeyCredential(fullPath, b.cred, nil)
	if err != nil {
		return err
	}

	// Use the caller's context rather than a fresh background context.
	_, err = client.Delete(ctx, nil)
	return err
}
// Move copies a blob from the source container to the backup container as a
// cool-tier block blob, sets a TTL on the copy, and removes the original.
//
// Parameters:
//   - ctx: context passed to the copy, TTL and delete requests.
//   - filePath: path of the blob inside the source container; the same path
//     is used in the backup container.
//
// Returns:
//   - err: the SAS generation, client creation, copy or delete error. A copy
//     or delete error whose text contains blobNotExists is treated as success.
//     A TTL failure is only logged.
func (b *Backup) Move(ctx context.Context, filePath string) error {
	// Full URL of the source blob.
	srcPath := b.azureBlobURL(b.getContainerPath(b.containerName), filePath)
	// Full URL of the destination blob in the backup container.
	destPath := b.azureBlobURL(b.getContainerPath(backupContainerName), filePath)

	// The copy reads the source through a read-only SAS token. Without a token
	// the copy cannot be authorised, so stop here instead of sending an unsigned URL.
	srcSAS, err := b.generateSASToken(filePath, sas.BlobPermissions{Read: true}, b.containerName)
	if err != nil {
		return err
	}

	client, err := blockblob.NewClientWithSharedKeyCredential(destPath, b.cred, &blockblob.ClientOptions{
		ClientOptions: policy.ClientOptions{
			Retry: policy.RetryOptions{
				MaxRetries:    1,
				MaxRetryDelay: 1 * time.Minute,
			},
		},
	})
	if err != nil {
		return err
	}

	tier := blob.AccessTierCool // Set cool tier type as cold tier not supported for this version of sdk
	_, err = client.UploadBlobFromURL(ctx, fmt.Sprintf("%s?%s", srcPath, srcSAS), &blockblob.UploadBlobFromURLOptions{
		Tier: &tier,
	})
	if err != nil && !strings.Contains(err.Error(), blobNotExists) {
		logger.Err(err).Msg(fmt.Sprintf(errCopy, srcPath))
		return err
	}

	// A TTL failure does not abort the move; it is logged and the original is still removed.
	err = b.setTTL(ctx, destPath)
	if err != nil && !strings.Contains(err.Error(), blobNotExists) {
		logger.Err(err).Msg(fmt.Sprintf(errTTL, destPath))
	}

	err = b.remove(ctx, filePath)
	if err != nil {
		if strings.Contains(err.Error(), blobNotExists) {
			return nil
		}
		// The blob being deleted is the source, so report the source path.
		logger.Err(err).Msg(fmt.Sprintf(errDelete, srcPath))
	}
	return err
}
// setTTL sets a relative expiry on a block blob and verifies that it was
// recorded.
//
// Parameters:
//   - ctx: context passed to both the SetExpiry and GetProperties requests.
//   - fileUrl: full URL of the blob that receives the expiry.
//
// Returns:
//   - error: the client-creation, SetExpiry or GetProperties error, or an
//     error if the blob reports no expiry afterwards. nil on success.
//
// The expiry is ttl hours from now, where ttl is the package-level setting.
func (b *Backup) setTTL(ctx context.Context, fileUrl string) error {
	blockBlobClient, err := blockblob.NewClientWithSharedKeyCredential(fileUrl, b.cred, &blockblob.ClientOptions{
		ClientOptions: policy.ClientOptions{
			Retry: policy.RetryOptions{
				MaxRetries:    1,
				MaxRetryDelay: 1 * time.Minute,
			},
		},
	})
	if err != nil {
		return err
	}

	// Set the expiry ttl hours relative to now, using the caller's context.
	_, err = blockBlobClient.SetExpiry(ctx, blockblob.ExpiryTypeRelativeToNow(ttl*int64(time.Hour)), nil)
	if err != nil {
		return err
	}

	// Validate the set-expiry operation by reading the blob's properties back.
	resp, err := blockBlobClient.GetProperties(ctx, nil)
	if err != nil {
		return err
	}
	if resp.ExpiresOn == nil {
		return fmt.Errorf("expiry was not set on %s", fileUrl)
	}
	return nil
}
// generateSASToken generates a Shared Access Signature (SAS) token for a blob.
//
// Parameters:
//   - blobName: name of the blob inside the container, e.g.
//     19UUA56873A044568/2023/01/11/raw.log.
//   - permission: the permissions granted by the token.
//   - containerName: the container holding the blob.
//
// Returns:
//   - token: the encoded SAS query string, empty on failure.
//   - err: the credential or signing error, which is also logged.
//
// The token is HTTPS-only, valid from one hour before now (to avoid signature
// issues) until roughly three years from now.
func (b *Backup) generateSASToken(blobName string, permission sas.BlobPermissions, containerName string) (token string, err error) {
	cred, err := azblob.NewSharedKeyCredential(b.azureAccount, b.azureAccountKey)
	if err != nil {
		logger.Err(err).Msg("[backup]:[NewSharedKeyCredential]")
		return "", err
	}

	values := sas.BlobSignatureValues{
		Protocol:      sas.ProtocolHTTPS,
		StartTime:     time.Now().UTC().Add(-1 * time.Hour),
		ExpiryTime:    time.Now().UTC().Add(3 * 365 * 24 * time.Hour),
		Permissions:   elptr.ElPtr(permission).String(),
		ContainerName: containerName,
		BlobName:      blobName,
	}

	signed, err := values.SignWithSharedKey(cred)
	if err != nil {
		logger.Err(err).Msg("Failed to sas.BlobSignatureValues")
		return "", err
	}
	return signed.Encode(), nil
}
// azureBlobURL joins filePath onto basePath with url.JoinPath and returns the
// result. The error from url.JoinPath is ignored.
func (b *Backup) azureBlobURL(basePath string, filePath string) string {
	joined, _ := url.JoinPath(basePath, filePath)
	return joined
}
// getContainerPath returns the URL of containerName in the Backup's storage
// account, with a trailing slash.
func (b *Backup) getContainerPath(containerName string) string {
	const containerURLFormat = "https://%s.blob.core.windows.net/%s/"
	return fmt.Sprintf(containerURLFormat, b.azureAccount, containerName)
}
// ToParquet converts a CSV blob from the backup container into a Parquet blob
// whose URL is under the raw-compressed container.
//
// Steps, as implemented below:
//
// 1. Builds the source URL under backupContainerName and the destination URL
// under azureRawCompressedContainerName, with the extension changed to "parquet".
// 2. Opens a download stream for the source blob.
// 3. Starts one goroutine that passes the stream to csvToParquet.Read and one
// that calls csvToParquet.Write, then waits for the writer goroutine.
//
// Parameters:
// - blobName: path of the source CSV blob inside the backup container.
// - guard: channel used to limit concurrent goroutines. One value is sent
// before each of the two goroutines starts (the send can block), and each
// goroutine receives one value when it finishes.
//
// Returns:
// - error: nil on success, and also nil when the client-creation or download
// error text contains blobNotExists. Other client or download errors are
// logged and returned. An error from Write is logged and returned.
func (b *Backup) ToParquet(blobName string, guard chan struct{}) error {
var err error
srcBlobURL := b.azureBlobURL(b.getContainerPath(backupContainerName), blobName)
parquetBlobName := b.changeFileExt(blobName, "parquet")
parquetBlobURL := b.azureBlobURL(b.getContainerPath(azureRawCompressedContainerName), parquetBlobName)
client, err := blockblob.NewClientWithSharedKeyCredential(srcBlobURL, b.cred, nil)
if err != nil {
// A missing blob is treated as nothing to do.
if strings.Contains(err.Error(), blobNotExists) {
return nil
}
logger.Err(err).Msg(errClient)
return err
}
downloadResp, err := client.DownloadStream(context.Background(), nil)
if err != nil {
// A missing blob is treated as nothing to do.
if strings.Contains(err.Error(), blobNotExists) {
return nil
}
logger.Err(err).Msg(fmt.Sprintf(errDownload, srcBlobURL))
return err
}
// The body is closed when ToParquet returns, i.e. after the writer has finished.
defer downloadResp.Body.Close()
csvToParquet := NewCSVtoParquet(b.azureAccount, b.azureAccountKey, parquetBlobURL)
guard <- struct{}{} // for reader
go func() {
defer func() {
<-guard
}()
// NOTE(review): this goroutine is not tracked by wg; presumably Write only
// completes after Read has consumed the stream — confirm.
csvToParquet.Read(downloadResp.Body)
}()
var wg sync.WaitGroup
wg.Add(1)
guard <- struct{}{} // for writer
go func(w *sync.WaitGroup) {
defer func() {
w.Done()
<-guard
}()
// err is the outer variable; it is read after wg.Wait returns.
err = csvToParquet.Write()
if err != nil {
logger.Err(err).Msg(fmt.Sprintf(errParquetWriter, parquetBlobURL))
}
}(&wg)
wg.Wait()
return err
}
// changeFileExt replaces the file extension of blobName with fileExt and
// returns the modified name.
//
// Only the final path segment (the part after the last "/") is inspected for
// an extension, so dots in directory names or in a URL host are left alone.
// If the final segment has no extension, fileExt is appended.
//
// Parameters:
//   - blobName: the original blob name or URL, with or without an extension.
//   - fileExt: the new extension; one leading dot is accepted and ignored
//     (both "txt" and ".txt" work).
//
// Returns:
//   - string: the blob name with the updated extension. An empty blobName
//     yields "." followed by fileExt.
func (b *Backup) changeFileExt(blobName, fileExt string) string {
	fileExt = strings.TrimPrefix(fileExt, ".")
	if blobName == "" {
		return "." + fileExt
	}

	// A dot only marks an extension when it is in the last path segment.
	lastSlash := strings.LastIndex(blobName, "/")
	if dot := strings.LastIndex(blobName, "."); dot > lastSlash {
		blobName = blobName[:dot]
	}
	return blobName + "." + fileExt
}

View File

@@ -0,0 +1,167 @@
package remotefileupload
import (
"context"
"sync"
"testing"
)
var (
// guard is the limiter channel passed to Backup.ToParquet in TestToParquet;
// it is buffered with a capacity of 100.
guard = make(chan struct{}, 100)
)
// TestAzureBlobURL checks Backup.azureBlobURL against a table of base path and
// file path combinations, including empty values and redundant slashes.
func TestAzureBlobURL(t *testing.T) {
	type urlCase struct {
		base     string
		filePath string
		expected string
	}

	cases := []urlCase{
		// Case 1: base path and file path are both empty.
		{base: "", filePath: "", expected: ""},
		// Case 2: base path set, file path empty.
		{base: "/base", filePath: "", expected: "/base"},
		// Case 3: base path and file path both set.
		{base: "/base", filePath: "dir1/dir2", expected: "/base/dir1/dir2"},
		// Case 4: base path empty, file path set.
		{base: "", filePath: "dir1/dir2/dir3/raw.csv", expected: "dir1/dir2/dir3/raw.csv"},
		// Case 5: redundant slashes in both base path and file path.
		{base: "/base//", filePath: "//dir1/dir2/dir3/raw.csv", expected: "/base/dir1/dir2/dir3/raw.csv"},
	}

	backup := NewBackup("", "", "")
	for _, tc := range cases {
		if got := backup.azureBlobURL(tc.base, tc.filePath); got != tc.expected {
			t.Errorf("Expected %s, got %s", tc.expected, got)
		}
	}
}
// TestRemove calls Backup.remove on a placeholder path and expects no error.
// It is skipped unconditionally.
func TestRemove(t *testing.T) {
	t.Skip()

	backup := NewBackup("fakeAccount", "fakeAccountKey", "fakeContainer")
	if err := backup.remove(context.Background(), "fakeDir1/fakeDir2/file.txt"); err != nil {
		t.Errorf("Expected no error, got %v", err.Error())
	}
}
// TestSetTTL calls Backup.setTTL on a placeholder blob URL and expects no
// error. It is skipped unconditionally.
func TestSetTTL(t *testing.T) {
	t.Skip()

	backup := NewBackup("fakeAccount", "fakeAccountKey", "fakeContainer")
	const blobURL = "https://fakeAccount.blob.core.windows.net/fakeContainer/fakeDir1/fakeDir2/file.txt"
	if err := backup.setTTL(context.Background(), blobURL); err != nil {
		t.Errorf("Expected no error, got %v", err.Error())
	}
}
// TestGetContainerPath checks that getContainerPath builds the container URL
// from the Backup's account name and the given container name.
func TestGetContainerPath(t *testing.T) {
	backup := NewBackup("fakeAccount", "fakeAccountKey", "fakeContainer")

	cases := []struct {
		containerName string
		expected      string
	}{
		{containerName: "container1", expected: "https://fakeAccount.blob.core.windows.net/container1/"},
		{containerName: "container2", expected: "https://fakeAccount.blob.core.windows.net/container2/"},
		{containerName: "container3", expected: "https://fakeAccount.blob.core.windows.net/container3/"},
	}

	for _, tc := range cases {
		if got := backup.getContainerPath(tc.containerName); got != tc.expected {
			t.Errorf("Expected %v, got %v", tc.expected, got)
		}
	}
}
// TestMove calls Backup.Move on a placeholder path and expects no error. It is
// skipped unconditionally.
func TestMove(t *testing.T) {
	t.Skip()

	backup := NewBackup("fakeAccount", "fakeAccountKey", "fakeContainer")
	if err := backup.Move(context.Background(), "fakeDir1/fakeDir2/file.txt"); err != nil {
		t.Errorf("Expected no error, got %v", err.Error())
	}
}
// TestChangeFileExt checks Backup.changeFileExt on plain names, nested paths,
// full URLs, an empty name, and extensions given with and without a leading dot.
func TestChangeFileExt(t *testing.T) {
	type extCase struct {
		fileUrl  string
		ext      string
		expected string
	}

	cases := []extCase{
		{fileUrl: "document.pdf", ext: "txt", expected: "document.txt"},
		{fileUrl: "dir1/dir2/document.txt", ext: "pdf", expected: "dir1/dir2/document.pdf"},
		{fileUrl: "", ext: "pdf", expected: ".pdf"},
		{fileUrl: "document", ext: "txt", expected: "document.txt"},
		{fileUrl: "document", ext: ".txt", expected: "document.txt"},
		{fileUrl: "document.txt", ext: ".pdf", expected: "document.pdf"},
		{
			fileUrl:  "https://fakeAccount.blob.core.windows.net/fakeContainer/fakeVin/fakeVersion/yyyy/mm/dd/raw.csv",
			ext:      ".parquet",
			expected: "https://fakeAccount.blob.core.windows.net/fakeContainer/fakeVin/fakeVersion/yyyy/mm/dd/raw.parquet",
		},
		{fileUrl: "fakeVin/fakeVersion/yyyy/mm/dd/file.txt", ext: "pdf", expected: "fakeVin/fakeVersion/yyyy/mm/dd/file.pdf"},
	}

	backup := NewBackup("fakeAccount", "fakeAccountKey", "fakeContainer")
	for _, tc := range cases {
		if got := backup.changeFileExt(tc.fileUrl, tc.ext); got != tc.expected {
			t.Errorf("Expected %v, got %v", tc.expected, got)
		}
	}
}
// TestToParquet runs Backup.ToParquet concurrently for three placeholder blob
// paths, sharing the package-level guard channel, and expects no errors. It is
// skipped unconditionally.
func TestToParquet(t *testing.T) {
	t.Skip()

	backup := NewBackup("fakeAccount", "fakeAccountKey", "fakeContainer")
	blobNames := []string{
		"fakeVin1/fakeVersion1/yyyy/mm/dd/raw.csv",
		"fakeVin2/fakeVersion2/yyyy/mm/dd/raw.csv",
		"fakeVin3/fakeVersio3/yyyy/mm/dd/raw.csv",
	}

	var pending sync.WaitGroup
	for _, name := range blobNames {
		pending.Add(1)
		go func(blobName string) {
			defer pending.Done()
			if err := backup.ToParquet(blobName, guard); err != nil {
				t.Errorf("Expected nil error, got %v", err.Error())
			}
		}(name)
	}
	pending.Wait()
}

View File

@@ -0,0 +1,259 @@
package remotefileupload
import (
"bytes"
"context"
"fmt"
"net/url"
"sync"
"time"
"fiskerinc.com/modules/logger"
"fiskerinc.com/modules/utils/envtool"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/appendblob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/bloberror"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/sas"
"github.com/pkg/errors"
"fiskerinc.com/modules/utils/elptr"
)
// Notes on earlier versions of this uploader that were tried and abandoned:
//   - Using a buffer instead of a byte slice: it would randomly drop one number.
//     The code did buffer.Write(separator) then buffer.Write(x), and the output
//     looked like 4,3,,8,1.
//   - Using an RWMutex on the map and a mutex on each []byte (read-lock to check
//     out the item, write-lock when assigning): could not get it to work, and it
//     dropped numbers as well.
//   - Synchronising on the inner string proved slightly more performant than
//     locking the whole thing.
// Switches and limits for the batch uploader.
var (
RunBatchTimer = true // For local testing, if you don't want to upload to azure, set to false
ConnectToAzBlob = true // For local testing, set to false to not use credentials
// batchMaxSize is the per-blob buffer size, in MiB, above which Upload sends
// the buffered data immediately instead of waiting for the batch timer.
batchMaxSize = envtool.GetEnvInt("AZURE_STORAGE_BATCH_UPLOAD_MAX_MIB", 2)
)
// NewAzureBatchUploader builds an AzureBatchUploader that buffers uploads in
// memory and sends them to Azure on a timer.
//
// Parameters:
//   - azureStorageContainerName: container that blobs are written to.
//   - azureFileExtension: extension appended to the fixed "raw" blob name.
//   - messageBatchTimeSeconds: interval between batch uploads; must be
//     positive when RunBatchTimer is true.
//   - batchSeparator: bytes inserted between buffered messages.
//
// Returns the uploader, or an error if the interval is not positive or the
// shared key credential cannot be created. In the credential case the
// partially initialised uploader is returned alongside the error.
//
// When RunBatchTimer is true, a goroutine is started that calls uploadNow on
// every tick. The ticker is never stopped here, so the goroutine keeps running.
func NewAzureBatchUploader(azureStorageContainerName string, azureFileExtension string, messageBatchTimeSeconds int, batchSeparator string) (Uploader, error) {
	// time.NewTicker panics on a non-positive interval; report it as an error instead.
	if RunBatchTimer && messageBatchTimeSeconds <= 0 {
		return nil, errors.Errorf("messageBatchTimeSeconds must be positive, got %d", messageBatchTimeSeconds)
	}

	a := &AzureBatchUploader{
		accountName:   azureAccount,
		containerName: azureStorageContainerName,
		fileExtension: azureFileExtension,
	}

	var cred *azblob.SharedKeyCredential
	if ConnectToAzBlob {
		var err error
		cred, err = azblob.NewSharedKeyCredential(a.accountName, azureAccountKey)
		if err != nil {
			return a, errors.WithStack(err)
		}
	}

	a.containerPath = fmt.Sprintf("https://%s.blob.core.windows.net/%s/", a.accountName, a.containerName)
	a.azureCredentials = cred
	a.separator = []byte(batchSeparator)
	a.logsToSend = &logsMapMutex{logs: map[string]*stringMutex{}}

	if RunBatchTimer {
		a.batchTicker = time.NewTicker(time.Duration(messageBatchTimeSeconds) * time.Second)
		go func() {
			for range a.batchTicker.C {
				a.uploadNow()
			}
		}()
	}
	return a, nil
}
// AzureBatchUploader buffers uploaded blocks per blob URL and sends them to
// Azure append blobs in batches.
type AzureBatchUploader struct {
// accountName is the storage account used in the container URL and credential.
accountName string
// containerName is the container that blobs are written to.
containerName string
// fileExtension is appended to the fixed "raw" blob name.
fileExtension string
// containerPath is the container URL, https://<account>.blob.core.windows.net/<container>/.
containerPath string
// azureCredentials is the shared key credential; it stays nil when ConnectToAzBlob is false.
azureCredentials *azblob.SharedKeyCredential
// logsToSend holds the pending data, keyed by blob URL, behind a mutex.
logsToSend *logsMapMutex // A map of strings
// separator is inserted between buffered messages and appended before each upload.
separator []byte
// batchTicker drives the periodic uploadNow calls; it is only set when RunBatchTimer is true.
batchTicker *time.Ticker
}
// logsMapMutex is the pending-upload map together with the mutex guarding it.
// Both Upload and uploadNow hold the mutex for their whole duration.
//
// Original design note: ideally the map is locked only when a new value has to
// be inserted, relying otherwise on the string itself to lock.
type logsMapMutex struct {
// logs maps a blob URL to its buffered data.
logs map[string]*stringMutex
sync.Mutex // Not really the mutex I want, will probably swap for the sync.Map later
}
// stringMutex is the buffered state for one blob. Despite the name it contains
// no mutex.
//
// Original design note: the mutex was removed because it could not easily be
// guaranteed that the outer map would not change while the string was being
// modified, without the mutex just becoming repetitive.
type stringMutex struct {
// Body is the data accumulated for the blob since the last upload.
Body []byte
// logValue is the key/value pair attached to log lines for this blob.
logValue LogPayload
// sasToken is the read-only SAS query string appended to the URL returned by Upload.
sasToken string
}
// Upload buffers block for the blob addressed by filePath instead of sending
// it immediately; the data is sent by the batch timer or when the buffer
// exceeds batchMaxSize MiB.
//
// Parameters:
//   - block: the bytes to buffer. They are copied, so the caller may reuse
//     the slice after Upload returns.
//   - logValue: key/value pair attached to log lines; stored on first use of
//     a blob URL within a batch.
//   - filePath: path segments under the container; "raw" plus the uploader's
//     file extension is appended as the blob name.
//
// Returns the blob URL followed by "?" and the blob's read-only SAS token
// (empty when ConnectToAzBlob is false), or the SAS generation error.
func (a *AzureBatchUploader) Upload(block []byte, logValue LogPayload, filePath ...string) (string, error) {
	blobURL := a.azureBlobURL(a.containerPath, filePath)

	// The map lock is held for the whole call, including an early upload.
	pending := a.logsToSend
	pending.Lock()
	defer pending.Unlock()

	entry, ok := pending.logs[blobURL]
	if !ok {
		var sasToken string
		// First message for this blob in the current batch: generate its SAS
		// token. Some form of caching should be implemented for this.
		if ConnectToAzBlob {
			var err error
			sasToken, err = a.generateSASToken(a.azureBlobFilePath(filePath))
			if err != nil {
				logger.Error().Str(logValue.Title, logValue.Value).Err(err).Send()
				return "", err
			}
		}
		entry = &stringMutex{
			// Copy into a fresh, non-nil buffer so later appends never touch
			// the caller's backing array.
			Body:     append(make([]byte, 0, len(block)), block...),
			logValue: logValue,
			sasToken: sasToken,
		}
		pending.logs[blobURL] = entry
	} else {
		// A nil Body means the buffer was just flushed and the blob already
		// ends with a separator, so do not add a second one.
		if entry.Body != nil {
			entry.Body = append(entry.Body, a.separator...)
		}
		entry.Body = append(entry.Body, block...)
	}

	// To keep memory bounded, send the data early once it grows too large.
	// This slows down other incoming logs, so do not make this value too small.
	if len(entry.Body) > 1024*1024*batchMaxSize {
		a.uploadBlock(blobURL, entry, context.Background())
		entry.Body = nil
	}

	return blobURL + "?" + entry.sasToken, nil
}
// uploadNow sends every buffered blob and then replaces the pending map with
// an empty one. It is called on each batch-timer tick.
//
// The map lock is held for the whole call, so concurrent Upload calls wait
// until all blobs have been sent. Errors from uploadBlock are not inspected
// here.
func (a *AzureBatchUploader) uploadNow() {
	pending := a.logsToSend
	pending.Lock()
	defer pending.Unlock()

	ctx := context.Background()
	for blobURL, entry := range pending.logs {
		a.uploadBlock(blobURL, entry, ctx)
	}

	// Still under the lock: start the next batch with a fresh map.
	pending.logs = make(map[string]*stringMutex)
}
// uploadBlock appends the buffered data of one blob to its append blob,
// creating the blob first if it does not exist.
//
// The uploader's separator is appended to block.Body before sending, so
// block.Body is modified. The data is sent in chunks of at most 4 MiB, one
// AppendBlock call per chunk.
//
// Every failure is logged and returned; nil is returned once all chunks have
// been sent.
func (a *AzureBatchUploader) uploadBlock(blobURL string, block *stringMutex, ctx context.Context) (err error) {
	retry := policy.RetryOptions{
		MaxRetries:    1,
		MaxRetryDelay: 1 * time.Minute,
	}
	client, err := appendblob.NewClientWithSharedKeyCredential(blobURL, a.azureCredentials, &appendblob.ClientOptions{
		ClientOptions: policy.ClientOptions{Retry: retry},
	})
	if err != nil {
		logger.Error().Str("Message", "Failed to create new client with shared key credential").Err(err).Send()
		return err
	}

	logValue := block.logValue
	logger.Debug().Str(logValue.Title, logValue.Value).Msgf("sending block of length %d to azure container: %s", len(block.Body), blobURL)
	block.Body = append(block.Body, a.separator...)

	// Check whether the blob exists up front and create it only when missing.
	if _, err = client.GetProperties(ctx, nil); err != nil {
		if !bloberror.HasCode(err, bloberror.BlobNotFound) {
			logger.Error().Str(logValue.Title, logValue.Value).Err(err).Send()
			return err
		}
		if _, err = client.Create(ctx, nil); err != nil {
			logger.Error().Str(logValue.Title, logValue.Value).Err(err).Send()
			return err
		}
	}

	// 1024*1024*4 == 4 MiB per AppendBlock call.
	const maxChunk = 1024 * 1024 * 4
	for remaining := block.Body; len(remaining) > 0; {
		size := len(remaining)
		if size > maxChunk {
			size = maxChunk
		}

		chunk := NopCloser(bytes.NewReader(remaining[:size]))
		if _, err = client.AppendBlock(ctx, chunk, nil); err != nil {
			logger.Error().Str(logValue.Title, logValue.Value).Err(err).Msgf("Max: %d, len(body): %d", size, len(remaining))
			return err
		}

		remaining = remaining[size:]
		logger.Debug().Str(logValue.Title, logValue.Value).Msgf("upload complete")
	}
	return nil
}
// azureBlobFilePath returns the blob's path relative to the container: the
// given segments joined with url.JoinPath, followed by "raw" plus the
// uploader's file extension. Errors from url.JoinPath are ignored.
func (a *AzureBatchUploader) azureBlobFilePath(filepath []string) string {
	dir, _ := url.JoinPath("", filepath...)
	blobPath, _ := url.JoinPath(dir, fmt.Sprintf("%s%s", "raw", a.fileExtension))
	return blobPath
}
// azureBlobURL returns the full blob URL for an upload.
//
// basePath is the container URL (https://<account>.blob.core.windows.net/<container>/).
// The filePath segments are joined onto it, followed by the blob name "raw"
// plus the uploader's file extension. Errors from url.JoinPath are ignored.
func (a *AzureBatchUploader) azureBlobURL(basePath string, filePath []string) string {
	dirURL, _ := url.JoinPath(basePath, filePath...)
	blobURL, _ := url.JoinPath(dirURL, fmt.Sprintf("%s%s", "raw", a.fileExtension))
	return blobURL
}
// generateSASToken signs a read-only, HTTPS-only SAS for blobName in the
// uploader's container using the uploader's shared key credentials, and
// returns the encoded query string.
//
// blobName looks like 19UUA56873A044568/2023/01/11/raw.log. The token is
// valid from one hour in the past (to avoid signature issues) until roughly
// three years from now. A signing failure is logged and returned with an
// empty token.
func (a *AzureBatchUploader) generateSASToken(blobName string) (token string, err error) {
	readOnly := sas.BlobPermissions{Read: true}
	values := sas.BlobSignatureValues{
		Protocol:      sas.ProtocolHTTPS,
		StartTime:     time.Now().UTC().Add(-1 * time.Hour),           // one hour back to avoid signature issues
		ExpiryTime:    time.Now().UTC().Add(3 * 365 * 24 * time.Hour), // 3 years-ish
		Permissions:   elptr.ElPtr(readOnly).String(),
		ContainerName: a.containerName,
		BlobName:      blobName,
	}

	signed, err := values.SignWithSharedKey(a.azureCredentials)
	if err != nil {
		logger.Error().Err(err).Msg("Failed to sas.BlobSignatureValues")
		return "", err
	}
	return signed.Encode(), nil
}
/* func MutexLocked(m *sync.Mutex) bool {
state := reflect.ValueOf(m).Elem().FieldByName("state")
const mutexLocked int64 = 1
return state.Int()&mutexLocked == mutexLocked
} */

View File

@@ -0,0 +1,407 @@
package remotefileupload
import (
"bytes"
"context"
"encoding/gob"
"fmt"
"math/rand"
"sort"
"strconv"
"strings"
"sync"
"testing"
"time"
)
// TestBlockUpload sends a body slightly larger than 5 MiB through uploadBlock
// so that it has to be split into more than one 4 MiB chunk.
//
// Skipped by default: it needs real Azure credentials in place of REPLACE_ME.
func TestBlockUpload(t *testing.T) {
	t.Skip()
	azureAccount = "REPLACE_ME"
	azureAccountKey = "REPLACE_ME"
	RunBatchTimer = false
	uploader, err := NewAzureBatchUploader("trex-logs", ".txt", 30, ",")
	if err != nil {
		// Fatal: continuing would dereference a nil uploader below.
		t.Fatal(err)
	}
	p, ok := uploader.(*AzureBatchUploader)
	if !ok {
		t.Fatal("Could not convert uploader to azure batch uploader")
	}
	typedUploader := *p
	filePath := typedUploader.azureBlobURL(typedUploader.containerPath, []string{"4mibUpload"})
	fakeFile := stringMutex{
		Body:     []byte{},
		logValue: LogPayload{Title: "4mibTetFile", Value: "4mibUpload"},
	}
	// Fill the body with "0,1,2,..." until it reaches 5 MiB.
	for x := 0; len(fakeFile.Body) < 1024*1024*5; x++ {
		fakeFile.Body = append(fakeFile.Body, []byte(fmt.Sprintf("%d,", x))...)
	}
	if err = typedUploader.uploadBlock(filePath, &fakeFile, context.Background()); err != nil {
		t.Error(err)
	}
}
// TestBlockUploadCheckPath queues one small payload, forces an immediate
// upload and logs the path returned by Upload so it can be checked by hand.
//
// Skipped by default: it needs real Azure credentials in place of REPLACE_ME.
func TestBlockUploadCheckPath(t *testing.T) {
	t.Skip()
	azureAccount = "REPLACE_ME"
	azureAccountKey = "REPLACE_ME"

	uploader, err := NewAzureBatchUploader("trex-logs", ".txt", 30, ",")
	if err != nil {
		t.Fatal(err)
	}

	payload := LogPayload{Title: "Test", Value: "Value"}
	path, err := uploader.Upload([]byte("Hello This is a file path test"), payload, "/file", "test")
	if err != nil {
		t.Fatal(err)
	}

	batch, isBatch := uploader.(*AzureBatchUploader)
	if !isBatch {
		t.Fatal("Could not convert uploader to azure batch uploader")
	}
	typedUploader := *batch
	typedUploader.uploadNow()

	// Opening this path should give you the uploaded file.
	t.Log(path)
}
// TestMutexValues starts 100 goroutines; goroutine y writes its own number y
// to each of the 100 files file/test0 .. file/test99. Afterwards every file
// must contain each of the numbers 0..99.
func TestMutexValues(t *testing.T) {
	a := AzureBatchUploader{
		accountName:   "fakeName",
		containerName: "fakeContainer",
		fileExtension: ".txt",
		separator:     []byte{','},
	}
	ConnectToAzBlob = false
	a.logsToSend = &logsMapMutex{
		logs:  map[string]*stringMutex{},
		Mutex: sync.Mutex{},
	}
	logP := LogPayload{
		Title: "Test",
		Value: "Value",
	}

	var gr sync.WaitGroup
	gr.Add(100)
	for x := 0; x < 100; x++ {
		go func(y int) {
			defer gr.Done()
			for z := 0; z < 100; z++ {
				_, _ = a.Upload([]byte(strconv.Itoa(y)), logP, fmt.Sprintf("/file/test%d", z))
			}
		}(x)
	}
	gr.Wait()

	// Read the map through the pointer; copying the struct would copy its mutex.
	logs := a.logsToSend.logs
	for x := 0; x < 100; x++ {
		filePath := fmt.Sprintf("file/test%d/raw.txt", x)
		mstring, ok := logs[filePath]
		if !ok || mstring == nil {
			// Fatal: the entry is dereferenced below.
			t.Fatalf("no buffered entry for %s", filePath)
		}
		t.Log(string(mstring.Body))
		if !checkNumberString(string(mstring.Body), 100, t) {
			t.Errorf("%s does not contain the numbers 0..99", filePath)
		}
	}
}
// TestMutexValuesOrderSwap starts 100 goroutines; goroutine y writes the
// numbers 0..99 to its own file file/test<y>. Afterwards every file must
// contain each of the numbers 0..99.
func TestMutexValuesOrderSwap(t *testing.T) {
	a := AzureBatchUploader{
		accountName:   "fakeName",
		containerName: "fakeContainer",
		fileExtension: ".txt",
		separator:     []byte{','},
	}
	a.logsToSend = &logsMapMutex{
		logs:  map[string]*stringMutex{},
		Mutex: sync.Mutex{},
	}
	logP := LogPayload{
		Title: "Test",
		Value: "Value",
	}

	var gr sync.WaitGroup
	gr.Add(100)
	for x := 0; x < 100; x++ {
		go func(y int) {
			defer gr.Done()
			for z := 0; z < 100; z++ {
				_, _ = a.Upload([]byte(strconv.Itoa(z)), logP, fmt.Sprintf("/file/test%d", y))
			}
		}(x)
	}
	gr.Wait()

	// Read the map through the pointer; copying the struct would copy its mutex.
	logs := a.logsToSend.logs
	for x := 0; x < 100; x++ {
		filePath := fmt.Sprintf("file/test%d/raw.txt", x)
		mstring, ok := logs[filePath]
		if !ok || mstring == nil {
			// Fatal: the entry is dereferenced below.
			t.Fatalf("no buffered entry for %s", filePath)
		}
		t.Log(string(mstring.Body))
		if !checkNumberString(string(mstring.Body), 100, t) {
			t.Errorf("%s does not contain the numbers 0..99", filePath)
		}
	}
}
// TestDoesMapSwap queues writes to one file from 99 goroutines with small
// random delays and calls uploadNow while they are still running, then once
// more after all of them have finished.
//
// Skipped by default: it needs real Azure credentials in place of REPLACE_ME.
func TestDoesMapSwap(t *testing.T) {
	t.Skip()
	azureAccount = "REPLACE_ME"
	azureAccountKey = "REPLACE_ME"
	RunBatchTimer = false
	up, err := NewAzureBatchUploader("trex-logs", ".log", 60, "\n")
	if err != nil {
		// Fatal: continuing would dereference a nil uploader below.
		t.Fatal(err)
	}
	p, ok := up.(*AzureBatchUploader)
	if !ok {
		t.Fatal("Could not convert uploader to azure batch uploader")
	}
	a := *p
	logP := LogPayload{
		Title: "Test",
		Value: "Value",
	}

	var wg sync.WaitGroup
	wg.Add(99)
	a.Upload([]byte(strconv.Itoa(0)), logP, "/file/test")
	for x := 1; x < 100; x++ {
		go func(x int) {
			defer wg.Done()
			time.Sleep(time.Millisecond * time.Duration(rand.Int63n(10)))
			a.Upload([]byte(strconv.Itoa(x)), logP, "/file/test")
		}(x)
	}

	// First upload happens while writers are still active.
	time.Sleep(5 * time.Millisecond)
	a.uploadNow()
	wg.Wait()
	// Second upload flushes whatever arrived after the first one.
	a.uploadNow()
}
// Writes is the number of numbers to write, threads is how many cars are sending in data
func benchmarkMutex1(writes, threads int, b *testing.B) {
a := AzureBatchUploader{
accountName: "fakeName",
containerName: "fakeContainer",
fileExtension: ".txt",
separator: []byte{','},
}
a.logsToSend = &logsMapMutex{logs: map[string]*stringMutex{},
Mutex: sync.Mutex{}}
logP := LogPayload{
Title: "Test",
Value: "Value",
}
gr := sync.WaitGroup{}
gr.Add(threads)
for x := 0; x < threads; x++ {
//t.Logf("Number is %d\n", x)
go func(y int) {
for z := 0; z < writes; z++ {
p, _ := a.Upload([]byte(strconv.Itoa(z)), logP, fmt.Sprintf("/file/test%d", y))
_ = p
}
gr.Done()
}(x)
}
gr.Wait()
}
// BenchmarkMutex1w100t100 runs benchmarkMutex1 with 100 writes per goroutine
// and 100 goroutines.
func BenchmarkMutex1w100t100(b *testing.B) {
benchmarkMutex1(100, 100, b)
}
// BenchmarkMutex1w100t1000 runs benchmarkMutex1 with 100 writes per goroutine
// and 1000 goroutines.
func BenchmarkMutex1w100t1000(b *testing.B) {
benchmarkMutex1(100, 1000, b)
}
// BenchmarkMutex1w100t10000 runs benchmarkMutex1 with 100 writes per
// goroutine and 10000 goroutines.
func BenchmarkMutex1w100t10000(b *testing.B) {
benchmarkMutex1(100, 10000, b)
}
// BenchmarkMutex1w100t100000 runs benchmarkMutex1 with 100 writes per
// goroutine and 10000 goroutines.
// NOTE(review): the name says t100000 but the call passes 10000 goroutines,
// the same as BenchmarkMutex1w100t10000 — confirm which is intended.
//
// Previously recorded output:
// BenchmarkMutex1w100t100000-16 1000000000 0.5827 ns/op 0 B/op 0 allocs/op
func BenchmarkMutex1w100t100000(b *testing.B) {
benchmarkMutex1(100, 10000, b)
}
// BenchmarkMutex1w1000t100000 runs benchmarkMutex1 with 1000 writes per
// goroutine and 10000 goroutines.
// NOTE(review): the name says t100000 but the call passes 10000 goroutines —
// confirm which is intended.
//
// Previously recorded output:
// BenchmarkMutex1w1000t100000-16 1 4727771437 ns/op 8845213144 B/op 208939787 allocs/op
// BenchmarkMutex1w1000t100000-16 1 6521959624 ns/op 9321626472 B/op 228890209 allocs/op
func BenchmarkMutex1w1000t100000(b *testing.B) {
benchmarkMutex1(1000, 10000, b)
}
// checkNumberString reports whether str, a comma-separated list of integers,
// contains the numbers 0..max-1 once sorted.
//
// Only the first max sorted values are inspected, so extra larger values are
// tolerated. A list with fewer than max values yields false instead of
// indexing past the end of the slice. A field that is not an integer fails
// the test immediately through t.
func checkNumberString(str string, max int, t *testing.T) (success bool) {
	fields := strings.Split(str, ",")
	numbers := make([]int, len(fields))
	for i, field := range fields {
		value, err := strconv.Atoi(field)
		if err != nil {
			t.Fatal(err)
		}
		numbers[i] = value
	}
	// Too few values can never cover 0..max-1.
	if len(numbers) < max {
		return false
	}
	sort.Ints(numbers)
	for want := 0; want < max; want++ {
		if numbers[want] != want {
			return false
		}
	}
	return true
}
// checkNumberStringRange reports whether str, a comma-separated list of
// integers, contains exactly the numbers start..end-1 (inclusive start,
// exclusive end) once sorted.
//
// The number of values must equal end-start, so a list that stops early or
// runs past end yields false. A field that is not an integer fails the test
// immediately through t.
func checkNumberStringRange(str string, start int, end int, t *testing.T) (success bool) {
	fields := strings.Split(str, ",")
	numbers := make([]int, len(fields))
	for i, field := range fields {
		value, err := strconv.Atoi(field)
		if err != nil {
			t.Fatal(err)
		}
		numbers[i] = value
	}
	// end was previously ignored; the range is only covered when the count matches.
	if len(numbers) != end-start {
		return false
	}
	sort.Ints(numbers)
	want := start
	for _, got := range numbers {
		if got != want {
			return false
		}
		want++
	}
	return true
}
// BenchmarkTotalSize buffers four log messages for each of 50,000 simulated
// cars (one message every 15 seconds for a minute) in an offline uploader and
// logs the gob-encoded size of the buffered map.
//
// A previous run measured roughly 38.439708 megabytes with realistically
// sized log lines.
func BenchmarkTotalSize(b *testing.B) {
	const (
		cars           = 50000
		messagesPerCar = 4
	)
	// This is a longish message
	testMSG := `{"level":"error","timestamp":"2022-Nov-30 22:17:26.250332","line_number":0,"filename":"dummy","msg":"ws_handshake: The WebSocket handshake was declined by the remote peer"}`
	RunBatchTimer = false
	ConnectToAzBlob = false
	uploader, err := NewAzureBatchUploader("fakeName", ".txt", 30, "\n")
	if err != nil {
		// Fatal: continuing would dereference a nil uploader below.
		b.Fatal(err)
	}
	p, ok := uploader.(*AzureBatchUploader)
	if !ok {
		b.Fatal("Could not convert uploader to azure batch uploader")
	}
	typedUploader := *p

	for x := 0; x < cars; x++ {
		pl := LogPayload{
			Title: "VIN",
			Value: fmt.Sprintf("VINNUMBER%d", x),
		}
		for m := 0; m < messagesPerCar; m++ {
			if _, err = typedUploader.Upload([]byte(testMSG), pl, pl.Value); err != nil {
				b.Error(err)
			}
		}
	}

	size, err := getRealSizeOf(typedUploader.logsToSend.logs)
	if err != nil {
		b.Fatal(err)
	}
	b.Log(size)
}
// getRealSizeOf returns the number of bytes v occupies when gob-encoded.
// If gob cannot encode v, it returns 0 and the encoding error.
func getRealSizeOf(v interface{}) (int, error) {
	var encoded bytes.Buffer
	err := gob.NewEncoder(&encoded).Encode(v)
	if err != nil {
		return 0, err
	}
	return encoded.Len(), nil
}
// TestOrder uploads the numbers 0..999 sequentially to a single file on an
// offline uploader and verifies that the buffered body holds them in the
// order they were written.
func TestOrder(t *testing.T) {
	RunBatchTimer = false
	ConnectToAzBlob = false
	uploader, err := NewAzureBatchUploader("fakeName", ".txt", 30, "\n")
	if err != nil {
		// Check the error before touching the returned uploader.
		t.Fatal(err)
	}
	p, ok := uploader.(*AzureBatchUploader)
	if !ok {
		t.Fatal("Could not convert uploader to azure batch uploader")
	}
	typedUploader := *p
	pl := LogPayload{
		Title: "VIN",
		Value: "SomeFakeVin",
	}
	for x := 0; x < 1000; x++ {
		if _, err := typedUploader.Upload([]byte(fmt.Sprint(x)), pl, pl.Value); err != nil {
			t.Fatal(err)
		}
	}

	const blobKey = "https://REPLACE_ME.blob.core.windows.net/fakeName/SomeFakeVin/raw.txt"
	val, ok := typedUploader.logsToSend.logs[blobKey]
	if !ok || val == nil {
		// Fatal: the entry is dereferenced below.
		t.Fatalf("no buffered entry for %s", blobKey)
	}

	numberArray := strings.Split(string(val.Body), "\n")
	for x := 0; x < len(numberArray)-1; x++ {
		a, err := strconv.Atoi(numberArray[x])
		if err != nil {
			t.Fatalf("entry %d is not a number: %v", x, err)
		}
		b, err := strconv.Atoi(numberArray[x+1])
		if err != nil {
			t.Fatalf("entry %d is not a number: %v", x+1, err)
		}
		if a+1 != b {
			t.Logf("Failed got %s before %s\n", numberArray[x], numberArray[x+1])
			t.Fail()
		}
	}
}

View File

@@ -0,0 +1,140 @@
package remotefileupload
import (
"bufio"
"fmt"
"io"
"strconv"
"strings"
"fiskerinc.com/modules/utils/envtool"
)
var (
// parquetFileSizeIdeal is the writer size at which Write closes the current
// parquet writer and starts a new file. It is read from
// PARQUET_FILE_SIZE_IN_COMPRESSED; 1024*1024*200 (200 MiB) is passed as the
// second argument to GetEnvInt, presumably the default — confirm in envtool.
parquetFileSizeIdeal = int64(envtool.GetEnvInt("PARQUET_FILE_SIZE_IN_COMPRESSED", 1024*1024*200))
// skipAzure makes newWriter return the fake in-memory writer instead of an
// Azure-backed one when true.
skipAzure = false
)
// ICSVtoParquet converts CSV lines into parquet output in two steps: Read
// feeds lines in, Write consumes them.
type ICSVtoParquet interface {
// Read reads lines from the given reader and queues them for Write.
Read(io.Reader) error
// Write drains the queued lines and writes them out as parquet rows.
Write() error
}
// csvToParquet is the ICSVtoParquet implementation returned by
// NewCSVtoParquet.
type csvToParquet struct {
// azureAccount and azureAccountKey are passed to NewAzureParquetBlobWriter.
azureAccount string
azureAccountKey string
// queue carries lines from Read to Write; Read closes it when it returns.
queue chan string
// parquetBlobPath is the prefix generateFile puts in front of each file name.
parquetBlobPath string
// counter is the index of the next parquet file; generateFile increments it.
counter int
}
// NewCSVtoParquet returns a converter that writes its parquet files next to
// parquetBlobUrl: the last path element of the URL is dropped and the
// remainder is used as the output prefix. Lines are handed from Read to Write
// through a channel buffered to 20 entries.
func NewCSVtoParquet(azureAccount, azureAccountKey, parquetBlobUrl string) ICSVtoParquet {
	converter := new(csvToParquet)
	converter.azureAccount = azureAccount
	converter.azureAccountKey = azureAccountKey
	converter.queue = make(chan string, 20)
	converter.parquetBlobPath = getPathFromURL(parquetBlobUrl)
	return converter
}
// Read reads lines from the provided io.Reader and sends them to the buffered
// channel 'queue' for Write to consume.
//
// Each line is sent including its trailing '\n'. A final line that is not
// terminated by '\n' is sent as well, without a newline, instead of being
// dropped. The channel is closed when Read returns, whether it reached the
// end of the input or hit an error.
//
// Parameters:
//
//	reader (io.Reader): The input stream from which lines are read.
//
// Returns:
//
//	error: The first read error other than io.EOF, or nil once the whole
//	input has been queued.
func (cp *csvToParquet) Read(reader io.Reader) error {
	defer close(cp.queue) // Close the channel when the input is done.
	bio := bufio.NewReader(reader)
	for {
		line, err := bio.ReadString('\n')
		if err != nil && err != io.EOF {
			return err
		}
		// ReadString returns the data read so far together with io.EOF when
		// the input does not end in '\n'; that last line must still be queued.
		if line != "" {
			cp.queue <- line
		}
		if err == io.EOF {
			return nil
		}
	}
}
// newWriter opens the writer for the next parquet file. The file name is
// generated, and the file counter advanced, whether or not Azure is used;
// with skipAzure set the name is discarded and a fake writer is returned.
func (cp *csvToParquet) newWriter() (ParquetBlobWriter, error) {
	target := cp.generateFile()
	if !skipAzure {
		return NewAzureParquetBlobWriter(target, cp.azureAccount, cp.azureAccountKey)
	}
	return NewFakeAzureParquetBlobWriter()
}
// Write consumes lines from the queue until it is closed and writes each one
// as a ParquetCANMessage row.
//
// A line is split on ","; lines with fewer than three fields are skipped.
// The first field is parsed as the int64 timestamp, the second as the int32
// id, and the third is stored as the data exactly as it appears in the line.
// Once the current writer reaches parquetFileSizeIdeal it is closed and a new
// file is started.
//
// NOTE(review): parse errors on the first two fields are ignored, so a
// malformed row is written with zero values — confirm this is intended.
//
// Returns the first error from opening a writer or writing a row. The writer
// that is open when Write returns is closed.
func (cp *csvToParquet) Write() error {
	writer, err := cp.newWriter()
	if err != nil {
		return err
	}
	defer func() {
		// writer is nil when rotation failed after the previous writer had
		// already been closed; there is nothing left to close in that case.
		if writer != nil {
			writer.Close()
		}
	}()

	for line := range cp.queue {
		splitedRaw := strings.Split(line, ",")
		if len(splitedRaw) < 3 {
			continue
		}
		timeStamp, _ := strconv.ParseInt(splitedRaw[0], 10, 64)
		idAs64, _ := strconv.ParseInt(splitedRaw[1], 10, 32)
		id := int32(idAs64)
		payload := ParquetCANMessage{
			TimestampUSec: &timeStamp,
			ID:            &id,
			Data:          &splitedRaw[2],
		}
		if err = writer.Write(payload); err != nil {
			return err
		}

		// Once the size reaches the limit (200MB by default), continue in a
		// new file to avoid memory issues.
		if writer.Size() >= parquetFileSizeIdeal {
			writer.Close()
			// Clear the closed writer first so a failed rotation neither
			// closes it twice nor calls Close on a nil interface.
			writer = nil
			next, nextErr := cp.newWriter()
			if nextErr != nil {
				return nextErr
			}
			writer = next
		}
	}
	return nil
}
// generateFile returns the name of the next parquet file,
// "<parquetBlobPath>/raw-<counter>.parquet", and advances the counter.
func (cp *csvToParquet) generateFile() string {
	index := cp.counter
	cp.counter++
	return fmt.Sprintf("%v/%v-%d.parquet", cp.parquetBlobPath, "raw", index)
}
// getPathFromURL returns file with its last "/"-separated element removed,
// i.e. everything before the final "/".
// An input without any "/" (including the empty string) yields "".
//
// Parameters:
//
//	file (string): The file path or URL to strip the file name from.
//
// Returns:
//
//	string: The directory part, without a trailing "/".
func getPathFromURL(file string) string {
	lastSlash := strings.LastIndex(file, "/")
	if lastSlash < 0 {
		return ""
	}
	return file[:lastSlash]
}

View File

@@ -0,0 +1,161 @@
package remotefileupload
import (
"bufio"
"bytes"
"log"
"reflect"
"sync"
"testing"
)
// content is the CSV fixture used by the tests in this file: 24 lines of
// "<timestamp>,<id>,<data>", each terminated by a newline.
var content = `1691443566877007,816,AAAA+gAAAAA=
1691443566877013,801,AAAAAAABAAA=
1691443566877019,835,AAAEAAAAAAA=
1691443566877059,1410,AAAkAAAAAAA=
1691443566877064,821,AAAAAAgAAAA=
1691443566877069,1304,AgAAAAAAAAA=
1691443566877074,1298,BAAAAAAAAAA=
1691443566877078,902,AQAAAAAAAAA=
1691443566877082,1137,AAAAAAAAAgA=
1691443566877085,54,CAAAAAAAAAA=
1691443566877089,54,BAAAAAAAAAA=
1691443566877093,1329,AAAAYagAAAA=
1691443566877096,608,YAAAAAAAAAA=
1691443566877100,1297,AIAAAAAAAAA=
1691443566877118,1268,AAAAADwAAAA=
1691443566877122,757,AAAAAAAeAAA=
1691443566877126,882,AAAAAABOAAA=
1691443566877143,1284,AAAAAAAH0AA=
1691443566877147,1285,AAAAAAAAgAA=
1691443566877167,1408,AAAAAAAtAAA=
1691443566877173,1584,AAAAAAAAAC0=
1691443566877512,873,AAAAAMgAAAA=
1691443567878825,1317,AAA+pngR/pc=
1691443567878850,816,AAAA4QAAAAA=
`
// contentArray holds the lines of content one per element, each including its
// trailing newline; TestRead uses it as the expected queue contents.
var contentArray = []string{
"1691443566877007,816,AAAA+gAAAAA=\n",
"1691443566877013,801,AAAAAAABAAA=\n",
"1691443566877019,835,AAAEAAAAAAA=\n",
"1691443566877059,1410,AAAkAAAAAAA=\n",
"1691443566877064,821,AAAAAAgAAAA=\n",
"1691443566877069,1304,AgAAAAAAAAA=\n",
"1691443566877074,1298,BAAAAAAAAAA=\n",
"1691443566877078,902,AQAAAAAAAAA=\n",
"1691443566877082,1137,AAAAAAAAAgA=\n",
"1691443566877085,54,CAAAAAAAAAA=\n",
"1691443566877089,54,BAAAAAAAAAA=\n",
"1691443566877093,1329,AAAAYagAAAA=\n",
"1691443566877096,608,YAAAAAAAAAA=\n",
"1691443566877100,1297,AIAAAAAAAAA=\n",
"1691443566877118,1268,AAAAADwAAAA=\n",
"1691443566877122,757,AAAAAAAeAAA=\n",
"1691443566877126,882,AAAAAABOAAA=\n",
"1691443566877143,1284,AAAAAAAH0AA=\n",
"1691443566877147,1285,AAAAAAAAgAA=\n",
"1691443566877167,1408,AAAAAAAtAAA=\n",
"1691443566877173,1584,AAAAAAAAAC0=\n",
"1691443566877512,873,AAAAAMgAAAA=\n",
"1691443567878825,1317,AAA+pngR/pc=\n",
"1691443567878850,816,AAAA4QAAAAA=\n",
}
// TestRead checks that Read queues every input line, in order and including
// its trailing newline, and that empty input queues nothing.
func TestRead(t *testing.T) {
	type readCase struct {
		name         string
		input        string
		expectedData []string
		expectedErr  error
	}
	cases := []readCase{
		{name: "ReadLinesSuccessfully", input: content, expectedData: contentArray, expectedErr: nil},
		{name: "EmptyInput", input: "", expectedData: []string{}, expectedErr: nil},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// The queue is large enough to hold the whole fixture, so Read
			// can finish before anything is received.
			cv := &csvToParquet{queue: make(chan string, 40)}
			err := cv.Read(bufio.NewReader(bytes.NewBufferString(tc.input)))

			var got []string
			for item := range cv.queue {
				got = append(got, item)
			}

			// A nil result and an empty expectation count as equal.
			bothEmpty := len(got) == 0 && len(tc.expectedData) == 0
			if !bothEmpty && !reflect.DeepEqual(got, tc.expectedData) {
				t.Errorf("For test '%s', expected queue %v, but got %v", tc.name, tc.expectedData, got)
			}
			if !errorsEqual(err, tc.expectedErr) {
				t.Errorf("For test '%s', expected error '%v', but got '%v'", tc.name, tc.expectedErr, err)
			}
		})
	}
}
// errorsEqual reports whether two errors are both nil or both non-nil with
// the same message.
func errorsEqual(err1, err2 error) bool {
	if err1 == nil || err2 == nil {
		// With at least one side nil, they match only when both are nil.
		return err1 == err2
	}
	return err1.Error() == err2.Error()
}
// TestGetPathFromURL checks that getPathFromURL drops the last path element
// for relative paths, absolute paths, a bare name and the empty string.
func TestGetPathFromURL(t *testing.T) {
	cases := []struct {
		input    string
		expected string
	}{
		{input: "path/to/file.txt", expected: "path/to"},
		{input: "another/path/to/image.jpg", expected: "another/path/to"},
		{input: "root", expected: ""},
		{input: "", expected: ""},
		{input: "/absolute/path/file.txt", expected: "/absolute/path"},
	}
	for _, tc := range cases {
		if got := getPathFromURL(tc.input); got != tc.expected {
			t.Errorf("For input %s, expected %s, but got %s", tc.input, tc.expected, got)
		}
	}
}
// BenchmarkReadWrite pushes the CSV fixture through Read and Write against
// the fake writer, with a 100-byte file size limit so that writer rotation is
// exercised.
//
// It sets the package-level skipAzure and parquetFileSizeIdeal and does not
// restore them.
func BenchmarkReadWrite(b *testing.B) {
	skipAzure = true
	parquetFileSizeIdeal = 100
	for i := 0; i < b.N; i++ {
		// A fresh reader per iteration: a shared one is drained by the first
		// iteration, leaving every later iteration with nothing to read.
		reader := bufio.NewReader(bytes.NewBufferString(content))
		converter := NewCSVtoParquet("", "", "https://yourstorageaccount.blob.core.windows.net/raw.csv")
		go converter.Read(reader)

		var wg sync.WaitGroup
		wg.Add(1)
		go func() {
			defer wg.Done()
			if err := converter.Write(); err != nil {
				log.Println(err)
			}
		}()
		// Write returns once Read has closed the queue and it is drained.
		wg.Wait()
	}
}

View File

@@ -0,0 +1,5 @@
package remotefileupload
import "github.com/pkg/errors"
// ErrInvalidUploader is a sentinel error carrying the message
// "invalid uploader type". Where it is returned is not visible in this file.
var ErrInvalidUploader = errors.New("invalid uploader type")

View File

@@ -0,0 +1,95 @@
package remotefileupload
import (
"context"
"fiskerinc.com/modules/logger"
az "github.com/Azure/azure-storage-blob-go/azblob"
pqAZ "github.com/xitongsys/parquet-go-source/azblob"
"github.com/xitongsys/parquet-go/source"
"github.com/xitongsys/parquet-go/writer"
)
var (
// parquetThreadCount is passed as the last argument to
// writer.NewParquetWriter in NewAzureParquetBlobWriter.
parquetThreadCount int64 = 4
)
var (
// errOnCloseWriter is the text logged next to the error when stopping the
// parquet writer or closing the blob fails.
errOnCloseWriter = "Unable to close writer"
)
// ParquetCANMessage is the row type written to the parquet files.
//
// The compressed parquet files list every field as optional, which is why
// the fields are pointers to int/string rather than plain values.
type ParquetCANMessage struct {
TimestampUSec *int64 `json:"epoch_usec" parquet:"name=epoch_usec, type=INT64"`
ID *int32 `json:"id" parquet:"name=id, type=INT32"`
Data *string `json:"data" parquet:"name=data, type=BYTE_ARRAY"`
}
// NewAzureParquetBlobWriter creates a ParquetBlobWriter that writes
// ParquetCANMessage rows to a parquet file in Azure Blob Storage.
//
// Parameters:
//   - blobUrl: The URL of the blob the parquet file is written to.
//   - azureAccount: The storage account name used for the shared key credential.
//   - azureAccountKey: The storage account key used for the shared key credential.
//
// Returns:
//   - ParquetBlobWriter: The writer, or nil when an error is returned.
//   - error: An error if the credential, the blob writer or the parquet
//     writer could not be created.
func NewAzureParquetBlobWriter(blobUrl, azureAccount, azureAccountKey string) (ParquetBlobWriter, error) {
	creds, err := az.NewSharedKeyCredential(azureAccount, azureAccountKey)
	if err != nil {
		return nil, err
	}
	fr, err := pqAZ.NewAzBlobFileWriter(
		context.Background(),
		blobUrl,
		creds,
		pqAZ.WriterOptions{},
	)
	if err != nil {
		return nil, err
	}
	pr, err := writer.NewParquetWriter(fr, new(ParquetCANMessage), parquetThreadCount)
	if err != nil {
		// The blob writer is already open at this point; close it so it is
		// not leaked when the parquet writer cannot be created.
		if closeErr := fr.Close(); closeErr != nil {
			logger.Debug().Msgf("%v: %s", closeErr, errOnCloseWriter)
		}
		return nil, err
	}
	return &AzureParquetBlobWriter{blob: fr, fileWriter: pr}, nil
}
// ParquetBlobWriter is the writer csvToParquet uses to emit parquet rows; it
// is implemented by AzureParquetBlobWriter and FakeAzureParquetBlobWriter.
type ParquetBlobWriter interface {
// Write adds one row to the current file.
Write(payload interface{}) error
// Size reports the writer's current size; csvToParquet.Write compares it
// against parquetFileSizeIdeal to decide when to start a new file.
Size() int64
// Close finishes the file. It returns nothing, so failures cannot be
// observed by the caller.
Close()
}
// AzureParquetBlobWriter is the ParquetBlobWriter built by
// NewAzureParquetBlobWriter.
type AzureParquetBlobWriter struct {
// blob is the underlying blob file; Close closes it after the parquet writer
// has been stopped.
blob source.ParquetFile
// fileWriter is the parquet writer layered on top of blob.
fileWriter *writer.ParquetWriter
}
// Write hands payload to the underlying parquet writer and returns its error,
// if any.
func (w *AzureParquetBlobWriter) Write(payload interface{}) error {
	return w.fileWriter.Write(payload)
}
// Size returns the Size field of the underlying parquet writer.
func (w *AzureParquetBlobWriter) Size() int64 {
return w.fileWriter.Size
}
// Close stops the parquet writer and then closes the blob. Both steps are
// always attempted; a failure in either is only logged at debug level.
func (w *AzureParquetBlobWriter) Close() {
	if stopErr := w.fileWriter.WriteStop(); stopErr != nil {
		logger.Debug().Msgf("%v: %s", stopErr, errOnCloseWriter)
	}
	if closeErr := w.blob.Close(); closeErr != nil {
		logger.Debug().Msgf("%v:%s", closeErr, errOnCloseWriter)
	}
}

View File

@@ -0,0 +1,31 @@
package remotefileupload
import (
"unsafe"
)
// NewFakeAzureParquetBlobWriter returns an empty in-memory ParquetBlobWriter
// that never talks to Azure. The returned error is always nil.
func NewFakeAzureParquetBlobWriter() (ParquetBlobWriter, error) {
	// The zero value already has a nil data slice and a size of 0.
	fake := &FakeAzureParquetBlobWriter{}
	return fake, nil
}
// FakeAzureParquetBlobWriter is an in-memory ParquetBlobWriter that keeps the
// payloads it is given instead of uploading them.
type FakeAzureParquetBlobWriter struct {
// data holds every payload passed to Write, in order.
data []interface{}
// size is the running total reported by Size.
size int64
}
// Write stores payload and increases the reported size by
// unsafe.Sizeof(payload), i.e. the size of the interface value itself rather
// than of the data it refers to. It always returns nil.
func (w *FakeAzureParquetBlobWriter) Write(payload interface{}) error {
	w.data = append(w.data, payload)
	w.size += int64(unsafe.Sizeof(payload))
	return nil
}
// Size returns the running total accumulated by Write.
func (w *FakeAzureParquetBlobWriter) Size() int64 {
return w.size
}
// Close does nothing; the fake writer holds no resources to release.
func (w *FakeAzureParquetBlobWriter) Close() {}

View File

@@ -0,0 +1,19 @@
package remotefileupload
// Names of the uploader kinds.
// NOTE(review): presumably used to select which Uploader implementation to
// build — the code that reads them is not visible here.
var (
AWSType string = "aws"
AzureType string = "azure"
AzureBatchType string = "azure_batch"
)
// Uploader stores a block of data at a remote location built from path
// segments.
type Uploader interface {
// Upload stores data under the path formed by the filePath segments, e.g.
// (dog, cat, mouse) => dog/cat/mouse. logValue supplies the key/value pair
// attached to log entries about this upload. It returns the path the data
// was stored under.
Upload(data []byte, logValue LogPayload, filePath ...string) (path string, err error)
}
// LogPayload is the key/value pair uploaders attach to their log entries so
// that a log line can be tied to what was being uploaded.
type LogPayload struct {
Title string // Log field name; a log line reads like "your {Title} {Value} errored".
Value string // Log field value, paired with Title.
}
// uploaderFilePathBuild is a function type taking no arguments and returning
// nothing. NOTE(review): no use of it is visible in this part of the file —
// confirm it is still needed.
type uploaderFilePathBuild func()