--- title: "Using Your Own Data" author: "Olatunde D. Akanbi" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Using Your Own Data} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6 ) ``` ```{r setup} library(manureshed) ``` ## Overview The `manureshed` package includes data from 1987-2016, but you can add your own: - **WWTP data** for years outside 2007-2016 - **Agricultural data** from other sources - **Custom spatial boundaries** - **Industrial or urban nutrient sources** ## Using Custom WWTP Data ### For Years Outside 2007-2016 The package has WWTP data for 2007-2016. For other years, provide your own: ```{r custom_wwtp_basic, eval=FALSE} # Use your WWTP files for 2020 results_2020 <- run_builtin_analysis( scale = "huc8", year = 2020, # Agricultural data available 1987-2016 nutrients = "nitrogen", include_wwtp = TRUE, custom_wwtp_nitrogen = "nitrogen_2021.csv", wwtp_load_units = "kg" ) # 3. Check results summarize_results(results_custom) quick_check(results_custom) # 4. Create maps nitrogen_map <- map_agricultural_classification( results_custom$integrated$nitrogen, "nitrogen", "combined_N_class", "2021 Nitrogen with Custom WWTP Data" ) save_plot(nitrogen_map, "custom_analysis_2021.png") # 5. Save results save_spatial_data( results_custom$integrated$nitrogen, "nitrogen_2021_results.rds" ) ``` This example shows the complete workflow from custom data to final maps and saved results. ## Advanced Custom Data Integration ### Adding Other Nutrient Sources You can integrate additional nutrient sources beyond WWTP: ```{r other_sources, eval=FALSE} # Example: Adding industrial sources industrial_data <- data.frame( Facility_Name = c("Steel Plant A", "Chemical Plant B", "Food Processor C"), Lat = c(41.5, 40.7, 42.1), Long = c(-81.7, -82.1, -83.5), N_Load_tons = c(50, 75, 30), P_Load_tons = c(10, 15, 8), Source_Type = "Industrial" ) # Convert to spatial format industrial_sf <- st_as_sf(industrial_data, coords = c("Long", "Lat"), crs = 4326) %>% st_transform(5070) # Transform to analysis CRS # Aggregate by spatial units (example for HUC8) boundaries <- load_builtin_boundaries("huc8") industrial_aggregated <- wwtp_aggregate_by_boundaries( industrial_sf, boundaries, "nitrogen", "huc8" ) # Add to existing results # (You would modify the integration functions to include industrial sources) ``` ### Working with Different Time Periods ```{r time_periods, eval=FALSE} # Create a time series dataset years_to_analyze <- 2018:2022 time_series_results <- list() for (year in years_to_analyze) { # Check if you have WWTP data for this year wwtp_file <- paste0("wwtp_nitrogen_", year, ".csv") if (file.exists(wwtp_file)) { time_series_results[[as.character(year)]] <- run_builtin_analysis( scale = "huc8", year = year, nutrients = "nitrogen", include_wwtp = TRUE, custom_wwtp_nitrogen = wwtp_file, save_outputs = FALSE, verbose = FALSE ) } else { # Agricultural only if no WWTP data time_series_results[[as.character(year)]] <- run_builtin_analysis( scale = "huc8", year = year, nutrients = "nitrogen", include_wwtp = FALSE, save_outputs = FALSE, verbose = FALSE ) } } # Compare results across years yearly_summary <- do.call(rbind, lapply(names(time_series_results), function(year) { result <- time_series_results[[year]] classes <- table(result$agricultural$N_class) data.frame( Year = as.numeric(year), Total_Units = sum(classes), Source_Units = classes["Source"] %||% 0, Sink_Units = sum(classes[c("Sink_Deficit", "Sink_Fertilizer")], na.rm = TRUE), Excluded_Units = classes["Excluded"] %||% 0 ) })) print(yearly_summary) ``` ### Custom Agricultural Data If you have agricultural data from other sources: ```{r custom_agricultural, eval=FALSE} # Example: Custom agricultural data format custom_farm_data <- data.frame( County_FIPS = c("39001", "39003", "39005"), Manure_N_kg = c(100000, 150000, 200000), Manure_P_kg = c(20000, 30000, 40000), Fertilizer_N_kg = c(50000, 75000, 100000), Fertilizer_P_kg = c(10000, 15000, 20000), N_Fixation_kg = c(25000, 40000, 35000), Crop_N_Removal_kg = c(80000, 120000, 140000), Crop_P_Removal_kg = c(15000, 25000, 30000), Cropland_acres = c(45000, 67000, 52000) ) # Convert to package format standardized_farm_data <- data.frame( ID = custom_farm_data$County_FIPS, NAME = paste("County", substr(custom_farm_data$County_FIPS, 3, 5)), manure_N = custom_farm_data$Manure_N_kg, manure_P = custom_farm_data$Manure_P_kg, fertilizer_N = custom_farm_data$Fertilizer_N_kg, fertilizer_P = custom_farm_data$Fertilizer_P_kg, N_fixation = custom_farm_data$N_Fixation_kg, crop_removal_N = custom_farm_data$Crop_N_Removal_kg, crop_removal_P = custom_farm_data$Crop_P_Removal_kg, cropland = custom_farm_data$Cropland_acres ) # Apply classifications custom_classified <- standardized_farm_data %>% agri_classify_nitrogen(cropland_threshold = 1234, scale = "county") %>% agri_classify_phosphorus(cropland_threshold = 1234, scale = "county") print(custom_classified) ``` ### Data Validation and Quality Control ```{r data_validation, eval=FALSE} # Function to validate your custom data validate_custom_data <- function(data, data_type = "wwtp") { issues <- list() if (data_type == "wwtp") { # Check required columns required_cols <- c("Facility_Name", "Lat", "Long") missing_cols <- setdiff(required_cols, names(data)) if (length(missing_cols) > 0) { issues$missing_columns <- missing_cols } # Check coordinate ranges (for US) if ("Lat" %in% names(data) && "Long" %in% names(data)) { invalid_coords <- sum(data$Lat < 24 | data$Lat > 50 | data$Long < -125 | data$Long > -66, na.rm = TRUE) if (invalid_coords > 0) { issues$invalid_coordinates <- invalid_coords } } # Check for negative loads load_cols <- names(data)[grepl("Load|load", names(data))] for (col in load_cols) { if (col %in% names(data)) { negative_loads <- sum(data[[col]] < 0, na.rm = TRUE) if (negative_loads > 0) { issues[[paste0("negative_", col)]] <- negative_loads } } } } if (data_type == "agricultural") { # Check for negative values in nutrient data nutrient_cols <- c("manure_N", "manure_P", "fertilizer_N", "fertilizer_P", "crop_removal_N", "crop_removal_P", "cropland") for (col in nutrient_cols) { if (col %in% names(data)) { negative_values <- sum(data[[col]] < 0, na.rm = TRUE) if (negative_values > 0) { issues[[paste0("negative_", col)]] <- negative_values } } } } if (length(issues) == 0) { message("✓ Data validation passed") } else { message("⚠ Data validation issues found:") for (issue in names(issues)) { message(" ", issue, ": ", issues[[issue]]) } } return(issues) } # Use the validation function # validate_custom_data(your_wwtp_data, "wwtp") # validate_custom_data(your_farm_data, "agricultural") ``` ### Exporting Results ```{r export_results, eval=FALSE} # Export results in different formats export_analysis_results <- function(results, output_dir = "exports") { dir.create(output_dir, showWarnings = FALSE) # Export agricultural data as CSV agri_csv <- st_drop_geometry(results$agricultural) write.csv(agri_csv, file.path(output_dir, "agricultural_results.csv"), row.names = FALSE) # Export as shapefile (if sf package available) if (requireNamespace("sf", quietly = TRUE)) { st_write(results$agricultural, file.path(output_dir, "agricultural_results.shp"), delete_dsn = TRUE, quiet = TRUE) } # Export WWTP facilities if available if ("wwtp" %in% names(results)) { for (nutrient in names(results$wwtp)) { facility_data <- results$wwtp[[nutrient]]$facility_data write.csv(facility_data, file.path(output_dir, paste0("wwtp_", nutrient, "_facilities.csv")), row.names = FALSE) } } # Export integrated results if available if ("integrated" %in% names(results)) { for (nutrient in names(results$integrated)) { integrated_csv <- st_drop_geometry(results$integrated[[nutrient]]) write.csv(integrated_csv, file.path(output_dir, paste0("integrated_", nutrient, "_results.csv")), row.names = FALSE) } } message("Results exported to: ", output_dir) } # Use the export function # export_analysis_results(results_custom, "my_exports") ``` ## Troubleshooting Common Issues ### WWTP Data Issues ```{r troubleshooting_wwtp, eval=FALSE} # Common WWTP data problems and solutions # Problem 1: "No valid facilities remaining after cleaning" # Solution: Check coordinates and load values wwtp_data <- read.csv("your_wwtp_file.csv") summary(wwtp_data) # Look for obvious issues # Check coordinate ranges summary(wwtp_data$Latitude) # Should be ~24-50 for US summary(wwtp_data$Longitude) # Should be ~-125 to -66 for US # Check load values summary(wwtp_data$Load) # Should be positive numbers # Problem 2: Wrong coordinate system # Solution: Make sure coordinates are in decimal degrees (not UTM) # Example conversion if needed: # library(sf) # points_utm <- st_as_sf(data, coords = c("X_UTM", "Y_UTM"), crs = 32617) # points_dd <- st_transform(points_utm, 4326) # coords_dd <- st_coordinates(points_dd) # Problem 3: Mixed units in same file # Solution: Standardize units before analysis standardize_mixed_units <- function(data, load_col, unit_col) { data$standardized_load <- ifelse( data[[unit_col]] == "kg", data[[load_col]], ifelse(data[[unit_col]] == "lbs", data[[load_col]] / 2.20462, data[[load_col]] * 907.185) # assuming tons ) return(data) } ``` ### Agricultural Data Issues ```{r troubleshooting_agricultural, eval=FALSE} # Common agricultural data problems # Problem: Impossible nutrient balances # Solution: Check your units and conversion factors check_nutrient_balance <- function(data) { # Calculate simple checks data$total_inputs_N <- data$manure_N + data$fertilizer_N + data$N_fixation data$efficiency_N <- data$crop_removal_N / data$total_inputs_N # Flag suspicious values suspicious <- data$efficiency_N > 2.0 | data$efficiency_N < 0.1 if (sum(suspicious, na.rm = TRUE) > 0) { message("Warning: ", sum(suspicious, na.rm = TRUE), " units have suspicious N efficiency") print(data[suspicious, c("ID", "total_inputs_N", "crop_removal_N", "efficiency_N")]) } return(data) } # Problem: Missing spatial boundaries # Solution: Use built-in boundaries or provide your own # county_boundaries <- load_builtin_boundaries("county") # your_data_with_boundaries <- merge(your_data, county_boundaries, by.x = "FIPS", by.y = "FIPS") ``` ## Getting Help ### Common Questions **Q: What format should my WWTP data be in?** A: CSV file with columns for facility name, latitude, longitude, annual load, and state. The package can auto-detect most EPA formats. **Q: What units can I use for loads?** A: "kg", "lbs", "pounds", or "tons". Specify with `wwtp_load_units`. **Q: Can I use data from other countries?** A: The spatial boundaries are US-only, but you could provide your own boundaries and modify the coordinate system. **Q: How do I handle missing data?** A: The package excludes facilities with missing coordinates or loads. Clean your data first for best results. **Q: My analysis is running slowly. What can I do?** A: Try using a smaller spatial scale (HUC2 < HUC8 < County in terms of processing time), fewer years, or clear the data cache with `clear_data_cache()`. **Q: How do I cite the package and data?** A: Use `citation_info()` to get proper citations for the package and underlying datasets. ### Function Help ```{r help, eval=FALSE} # Get help on specific functions ?load_user_wwtp ?run_builtin_analysis ?wwtp_clean_data # Check what data is available check_builtin_data() list_available_years() # Package health check health_check() # Test data download connection test_osf_connection() ``` ### Getting More Help - Check the other vignettes: `vignette("getting-started")`, `vignette("visualization-guide")`, `vignette("advanced-features")` - Look at function documentation: `?function_name` - Check the package website or GitHub repository for examples - Use `quick_check()` to validate your results The `manureshed` package is designed to work with your data as easily as possible. Start with the built-in examples, then gradually add your own data as needed.nitrogen_2020.csv" ) ### Handling Different Data Formats #### Standard EPA Format Most EPA WWTP files work automatically: ```{r epa_standard, eval=FALSE} # Standard EPA format (auto-detected) results <- run_builtin_analysis( scale = "county", year = 2018, nutrients = c("nitrogen", "phosphorus"), include_wwtp = TRUE, custom_wwtp_nitrogen = "nitrogen_2018.csv", custom_wwtp_phosphorus = "phosphorus_2018.csv" ) ``` #### Different Units ```{r different_units, eval=FALSE} # If your data uses pounds instead of kg results <- run_builtin_analysis( scale = "huc8", year = 2019, nutrients = "nitrogen", include_wwtp = TRUE, custom_wwtp_nitrogen = "nitrogen_pounds_2019.csv", wwtp_load_units = "lbs" # Converts automatically ) # Other units: "kg", "lbs", "pounds", "tons" ``` #### Different File Format ```{r different_format, eval=FALSE} # If headers are in different rows results <- run_builtin_analysis( scale = "county", year = 2021, nutrients = "nitrogen", include_wwtp = TRUE, custom_wwtp_nitrogen = "nitrogen_2021.csv", wwtp_skip_rows = 3, # Skip first 3 rows wwtp_header_row = 1 # Headers in row 1 after skipping ) ``` #### Custom Column Names ```{r custom_columns, eval=FALSE} # If your columns have different names custom_mapping <- list( facility_name = "Plant_Name", latitude = "Lat_DD", longitude = "Long_DD", pollutant_load = "Annual_Load_kg", state = "State_Code" ) results <- run_builtin_analysis( scale = "huc8", year = 2022, nutrients = "nitrogen", include_wwtp = TRUE, custom_wwtp_nitrogen = "custom_format.csv", wwtp_column_mapping = custom_mapping ) ``` ### Manual WWTP Processing For full control, process WWTP data step by step: ```{r manual_wwtp, eval=FALSE} # Step 1: Load your WWTP file wwtp_raw <- load_user_wwtp( file_path = "nitrogen_2020.csv", nutrient = "nitrogen", load_units = "lbs" ) # Step 2: Clean the data wwtp_clean <- wwtp_clean_data(wwtp_raw, "nitrogen") # Step 3: Classify facilities by size wwtp_classified <- wwtp_classify_sources(wwtp_clean, "nitrogen") # Step 4: Convert to spatial format wwtp_spatial <- wwtp_to_spatial(wwtp_classified) # Step 5: Load boundaries and aggregate by spatial units boundaries <- load_builtin_boundaries("huc8") wwtp_aggregated <- wwtp_aggregate_by_boundaries( wwtp_spatial, boundaries, "nitrogen", "huc8" ) # Step 6: Integrate with agricultural data agri_data <- load_builtin_nugis("huc8", 2020) agri_processed <- agri_classify_complete(agri_data, "huc8") integrated <- integrate_wwtp_agricultural( agri_processed, wwtp_aggregated, "nitrogen", cropland_threshold = 1234, scale = "huc8" ) ``` ## Unit Conversions ### Common Conversions ```{r unit_conversions, eval=FALSE} # Convert between units kg_loads <- c(1000, 2000, 5000) tons_loads <- convert_load_units(kg_loads, "kg") lbs_loads <- c(2000, 4000, 10000) tons_loads <- convert_load_units(lbs_loads, "lbs") print(data.frame( Original_kg = kg_loads, Converted_tons = convert_load_units(kg_loads, "kg") )) ``` ### Handling P2O5 vs P The package automatically converts P2O5 to P, but you can do it manually: ```{r p2o5_conversion, eval=FALSE} # If you have P2O5 data, convert to P p2o5_values <- c(100, 200, 300) # kg P2O5 p_values <- p2o5_values * P2O5_TO_P # Convert to P print(paste("P2O5 to P conversion factor:", P2O5_TO_P)) ``` ## Working with Different Spatial Scales ### County Data (FIPS Codes) ```{r county_data, eval=FALSE} # County analysis - make sure you have 5-digit FIPS codes county_results <- run_builtin_analysis( scale = "county", year = 2016, nutrients = "nitrogen" ) # Your county WWTP data should have state and county info ``` ### HUC8 Watersheds ```{r huc8_data, eval=FALSE} # HUC8 analysis - 8-digit watershed codes huc8_results <- run_builtin_analysis( scale = "huc8", year = 2016, nutrients = "nitrogen" ) # Make sure HUC8 codes are 8 digits (add leading zero if needed) huc8_codes <- c("4110001", "4110002") # 7 digits formatted_codes <- format_huc8(huc8_codes) # Adds leading zero print(formatted_codes) # "04110001", "04110002" ``` ### HUC2 Regions ```{r huc2_data, eval=FALSE} # HUC2 analysis - 2-digit regional codes huc2_results <- run_builtin_analysis( scale = "huc2", year = 2016, nutrients = "nitrogen" ) ``` ## State-Specific Analysis ### Single State ```{r state_analysis, eval=FALSE} # Analyze just one state iowa_results <- run_state_analysis( state = "IA", scale = "county", year = 2016, nutrients = "nitrogen", include_wwtp = TRUE ) # With custom WWTP data texas_results <- run_state_analysis( state = "TX", scale = "huc8", year = 2020, nutrients = "nitrogen", include_wwtp = TRUE, custom_wwtp_nitrogen = "texas_wwtp_2020.csv" ) ``` ### Multiple States ```{r multi_state, eval=FALSE} # Analyze several states midwest_states <- c("IA", "IL", "IN", "OH") state_results <- list() for (state in midwest_states) { state_results[[state]] <- run_state_analysis( state = state, scale = "county", year = 2016, nutrients = "nitrogen", include_wwtp = TRUE ) } ``` ## Quality Control ### Validate Your Data ```{r data_validation2, eval=FALSE} # Check your results make sense quick_check(results) # Look at the classifications table(results$agricultural$N_class) # Check WWTP facility counts if ("wwtp" %in% names(results)) { print(paste("WWTP facilities:", nrow(results$wwtp$nitrogen$facility_data))) } ``` ### Common Data Issues ```{r data_issues, eval=FALSE} # Problem: Negative nutrient values # Solution: Check your data source and units # Problem: All facilities excluded # Solution: Check coordinate system (should be decimal degrees) # Problem: No WWTP facilities found # Solution: Check facility coordinates are in correct format # Problem: Classification doesn't make sense # Solution: Check cropland threshold and nutrient balance calculations ``` ## Working with Multiple Years ### Time Series Analysis ```{r time_series, eval=FALSE} # Analyze multiple years years_to_analyze <- 2014:2016 batch_results <- batch_analysis_years( years = years_to_analyze, scale = "huc8", nutrients = "nitrogen", include_wwtp = TRUE ) # With custom WWTP data for some years custom_wwtp_files <- list( "2018" = "nitrogen_2018.csv", "2019" = "nitrogen_2019.csv", "2020" = "nitrogen_2020.csv" ) # Process each year with custom data custom_results <- list() for (year in names(custom_wwtp_files)) { custom_results[[year]] <- run_builtin_analysis( scale = "county", year = as.numeric(year), nutrients = "nitrogen", include_wwtp = TRUE, custom_wwtp_nitrogen = custom_wwtp_files[[year]] ) } ``` ## Data Management Tips ### File Organization ```{r , eval=FALSE} # Organize your data files # project_folder/ # ├── wwtp_data/ # │ ├── nitrogen_2018.csv # │ ├── nitrogen_2019.csv # │ └── phosphorus_2018.csv # ├── results/ # └── maps/ # Set working directory setwd("project_folder") # Run analysis with organized files results <- run_builtin_analysis( scale = "huc8", year = 2018, nutrients = "nitrogen", include_wwtp = TRUE, custom_wwtp_nitrogen = "wwtp_data/nitrogen_2018.csv", output_dir = "results" ) ``` ### Memory Management ```{r memory_management, eval=FALSE} # For large datasets, clear cache periodically clear_data_cache() # Check package health health_check() # Free up memory between analyses rm(large_results) gc() ``` ## Example: Complete Custom Workflow Here's a complete example using custom data: ```{r complete_example, eval=FALSE} # 1. Prepare your WWTP file (nitrogen_2021.csv) # Make sure it has columns: Facility Name, Latitude, Longitude, Load (kg/yr), State # 2. Run analysis results_custom <- run_builtin_analysis( scale = "huc8", year = 2021, # Agricultural data extrapolated to 2021 nutrients = "nitrogen", include_wwtp = TRUE, custom_wwtp_nitrogen = "nitrogen_2021.csv", wwtp_load_units = "kg" ) # 3. Check results summarize_results(results_custom) quick_check(results_custom) # 4. Create maps nitrogen_map <- map_agricultural_classification( results_custom$integrated$nitrogen, "nitrogen", "combined_N_class", "2021 Nitrogen with Custom WWTP Data" ) save_plot(nitrogen_map, "custom_analysis_2021.png") # 5. Save results save_spatial_data( results_custom$integrated$nitrogen, "nitrogen_2021_results.rds" ) ``` ### Integration Issues ```{r troubleshooting_integration, eval=FALSE} # Common integration problems # Problem: WWTP facilities not matching spatial units # Solution: Check coordinate systems and boundary files check_wwtp_spatial_match <- function(wwtp_data, boundaries) { # Convert WWTP to spatial wwtp_sf <- st_as_sf(wwtp_data, coords = c("Long", "Lat"), crs = 4326) wwtp_projected <- st_transform(wwtp_sf, st_crs(boundaries)) # Check spatial intersection intersections <- st_intersects(wwtp_projected, boundaries) # Count facilities per spatial unit matches <- lengths(intersections) message("WWTP spatial matching summary:") message(" Total facilities: ", nrow(wwtp_data)) message(" Facilities matched to boundaries: ", sum(matches > 0)) message(" Facilities not matched: ", sum(matches == 0)) if (sum(matches == 0) > 0) { unmatched <- wwtp_data[matches == 0, ] message(" Check coordinates for unmatched facilities") print(head(unmatched)) } return(matches) } # Problem: Scale mismatch between data sources # Solution: Ensure consistent spatial scales verify_scale_consistency <- function(agricultural_data, wwtp_data, scale) { message("Scale consistency check for: ", scale) # Check ID formats if (scale == "county") { # FIPS codes should be 5 digits invalid_fips <- sum(nchar(agricultural_data$ID) != 5) message(" Invalid FIPS codes: ", invalid_fips) } else if (scale == "huc8") { # HUC8 codes should be 8 digits invalid_huc8 <- sum(nchar(agricultural_data$ID) != 8) message(" Invalid HUC8 codes: ", invalid_huc8) } # Check coordinate ranges coord_summary <- list( lat_range = range(wwtp_data$Lat, na.rm = TRUE), long_range = range(wwtp_data$Long, na.rm = TRUE) ) message(" WWTP coordinate ranges:") message(" Latitude: ", paste(round(coord_summary$lat_range, 2), collapse = " to ")) message(" Longitude: ", paste(round(coord_summary$long_range, 2), collapse = " to ")) return(coord_summary) } ``` ## Best Practices for Custom Data ### File Organization ```{r file_organization, eval=FALSE} # Recommended project structure create_project_structure <- function(project_name) { # Create main directories dir.create(project_name, showWarnings = FALSE) dir.create(file.path(project_name, "data"), showWarnings = FALSE) dir.create(file.path(project_name, "data", "wwtp"), showWarnings = FALSE) dir.create(file.path(project_name, "data", "agricultural"), showWarnings = FALSE) dir.create(file.path(project_name, "results"), showWarnings = FALSE) dir.create(file.path(project_name, "maps"), showWarnings = FALSE) dir.create(file.path(project_name, "exports"), showWarnings = FALSE) # Create README readme_content <- paste( "# Manureshed Analysis Project", "", "## Data Files", "- data/wwtp/ - WWTP facility data files", "- data/agricultural/ - Custom agricultural data", "", "## Outputs", "- results/ - Analysis results (.rds files)", "- maps/ - Generated maps (.png files)", "- exports/ - Exported data (.csv, .shp files)", "", "## Analysis Scripts", "- analysis.R - Main analysis script", "", sep = "\n" ) writeLines(readme_content, file.path(project_name, "README.md")) message("Project structure created: ", project_name) message("Add your data files to the appropriate folders") } # Create organized project # create_project_structure("my_nutrient_analysis") ``` ### Data Documentation ```{r data_documentation, eval=FALSE} # Document your custom data sources document_data_sources <- function(wwtp_files = NULL, agricultural_files = NULL, output_file = "data_documentation.txt") { doc_content <- c( "DATA SOURCES DOCUMENTATION", "============================", "", "Analysis Date: ", as.character(Sys.Date()), "Package Version: ", as.character(packageVersion("manureshed")), "" ) if (!is.null(wwtp_files)) { doc_content <- c(doc_content, "WWTP DATA FILES:", "----------------" ) for (file in wwtp_files) { if (file.exists(file)) { file_info <- file.info(file) doc_content <- c(doc_content, paste("File:", file), paste(" Size:", round(file_info$size / 1024, 2), "KB"), paste(" Modified:", file_info$mtime), paste(" Rows:", nrow(read.csv(file))), "" ) } } } if (!is.null(agricultural_files)) { doc_content <- c(doc_content, "AGRICULTURAL DATA FILES:", "------------------------" ) for (file in agricultural_files) { if (file.exists(file)) { file_info <- file.info(file) doc_content <- c(doc_content, paste("File:", file), paste(" Size:", round(file_info$size / 1024, 2), "KB"), paste(" Modified:", file_info$mtime), paste(" Rows:", nrow(read.csv(file))), "" ) } } } doc_content <- c(doc_content, "ANALYSIS PARAMETERS:", "--------------------", "Built-in data years: 1987-2016 (Agricultural), 2007-2016 (WWTP)", "Spatial scales: county, huc8, huc2", "Default cropland threshold: 500 ha (1,234 acres)", "" ) writeLines(doc_content, output_file) message("Data documentation saved to: ", output_file) } # Use documentation function # document_data_sources( # wwtp_files = c("data/wwtp/nitrogen_2020.csv", "data/wwtp/phosphorus_2020.csv"), # agricultural_files = c("data/agricultural/custom_farm_data.csv") # ) ``` ### Quality Assurance Workflow ```{r qa_workflow, eval=FALSE} # Complete quality assurance workflow quality_assurance_workflow <- function(results, data_sources = NULL) { qa_report <- list() qa_report$timestamp <- Sys.time() qa_report$data_sources <- data_sources # 1. Basic data checks qa_report$basic_checks <- list( total_spatial_units = nrow(results$agricultural), missing_classifications = sum(is.na(results$agricultural$N_class)) ) # 2. Classification distribution if ("N_class" %in% names(results$agricultural)) { n_dist <- table(results$agricultural$N_class) qa_report$classification_distribution <- list( nitrogen = n_dist, excluded_percentage = round(n_dist["Excluded"] / sum(n_dist) * 100, 1) ) } # 3. WWTP integration checks if ("wwtp" %in% names(results)) { qa_report$wwtp_checks <- list() for (nutrient in names(results$wwtp)) { facilities <- results$wwtp[[nutrient]]$facility_data qa_report$wwtp_checks[[nutrient]] <- list( total_facilities = nrow(facilities), facilities_with_loads = sum(!is.na(facilities[[paste0(toupper(substr(nutrient, 1, 1)), "_Load_tons")]])) ) } } # 4. Spatial validation if (inherits(results$agricultural, "sf")) { qa_report$spatial_checks <- list( invalid_geometries = sum(!st_is_valid(results$agricultural)), crs = st_crs(results$agricultural)$input ) } # 5. Range checks if ("integrated" %in% names(results)) { for (nutrient in names(results$integrated)) { data <- results$integrated[[nutrient]] surplus_col <- paste0(substr(nutrient, 1, 1), "_surplus") if (surplus_col %in% names(data)) { qa_report$range_checks[[nutrient]] <- list( min_surplus = min(data[[surplus_col]], na.rm = TRUE), max_surplus = max(data[[surplus_col]], na.rm = TRUE), extreme_values = sum(abs(data[[surplus_col]]) > 1000, na.rm = TRUE) ) } } } # Generate QA summary qa_summary <- list( passed = TRUE, warnings = character(), errors = character() ) # Check for issues if (qa_report$basic_checks$missing_classifications > 0) { qa_summary$warnings <- c(qa_summary$warnings, paste("Missing classifications:", qa_report$basic_checks$missing_classifications)) } if (qa_report$classification_distribution$excluded_percentage > 50) { qa_summary$warnings <- c(qa_summary$warnings, paste("High exclusion rate:", qa_report$classification_distribution$excluded_percentage, "%")) } if ("spatial_checks" %in% names(qa_report) && qa_report$spatial_checks$invalid_geometries > 0) { qa_summary$errors <- c(qa_summary$errors, paste("Invalid geometries:", qa_report$spatial_checks$invalid_geometries)) qa_summary$passed <- FALSE } # Print summary message("Quality Assurance Summary:") message(" Status: ", ifelse(qa_summary$passed, "PASSED", "FAILED")) if (length(qa_summary$warnings) > 0) { message(" Warnings:") for (warning in qa_summary$warnings) { message(" - ", warning) } } if (length(qa_summary$errors) > 0) { message(" Errors:") for (error in qa_summary$errors) { message(" - ", error) } } qa_report$summary <- qa_summary return(qa_report) } # Run QA workflow # qa_results <- quality_assurance_workflow(results_custom, "Custom 2021 WWTP data") ``` ## Getting Help ### Common Questions **Q: What format should my WWTP data be in?** A: CSV file with columns for facility name, latitude, longitude, annual load, and state. The package can auto-detect most EPA formats. **Q: What units can I use for loads?** A: "kg", "lbs", "pounds", or "tons". Specify with `wwtp_load_units`. **Q: Can I use data from other countries?** A: The spatial boundaries are US-only, but you could provide your own boundaries and modify the coordinate system. **Q: How do I handle missing data?** A: The package excludes facilities with missing coordinates or loads. Clean your data first for best results. **Q: My analysis is running slowly. What can I do?** A: Try using a smaller spatial scale (HUC2 < HUC8 < County in terms of processing time), fewer years, or clear the data cache with `clear_data_cache()`. **Q: How do I cite the package and data?** A: Use `citation_info()` to get proper citations for the package and underlying datasets. **Q: Can I analyze partial years or seasonal data?** A: The package is designed for annual data. For seasonal analysis, you would need to aggregate your data to annual totals first. **Q: What if my WWTP data has different pollutant measurements?** A: The package expects total nitrogen and total phosphorus loads. If you have other forms (NO3, NH4, PO4), you'll need to convert to total N and P first. ### Function Help ```{r , eval=FALSE} # Get help on specific functions ?load_user_wwtp ?run_builtin_analysis ?wwtp_clean_data ?validate_columns # Check what data is available check_builtin_data() list_available_years() # Package health check health_check() # Test data download connection test_osf_connection() ``` ### Getting More Help - Check the other vignettes: `vignette("getting-started")`, `vignette("visualization-guide")`, `vignette("advanced-features")` - Look at function documentation: `?function_name` - Check the package website or GitHub repository for examples - Use `quick_check()` to validate your results - Run `health_check()` if you encounter installation or data loading issues ### Reporting Issues If you encounter bugs or have feature requests: 1. Check that your data format matches the expected format 2. Run `health_check()` to verify package installation 3. Try with built-in data first to isolate the issue 4. Document the error message and steps to reproduce 5. Check the package documentation and vignettes 6. Report issues with a minimal reproducible example The `manureshed` package is designed to work with your data as easily as possible. Start with the built-in examples, then gradually add your own data as needed. The key is to ensure your data is properly formatted and validated before running the analysis.