Databricks Cheat Sheets

Databricks Notebook Commands Command Purpose Example %config Set configuration options for the notebook %env Set environment variables %fs Interact with the Databricks file system %fs ls dbfs:/repo %load Loads the contents of a file into a cell %lsmagic List all magic commands %jobs Lists all running jobs %matplotlib sets up the matplotlib backend %md Write Markdown text %pip Install Python packages %python Executes python code %python dbutils.fs.rm("/user/hive/warehouse/test/", True) %r Execute R code %reload reloads module contents %run Executes a Python file or a notebook %scala Executes scala code %sh Executes shell commands on the cluster nodes %sh git clone https://github.com/repo/test %sql Executes SQL queries %who Lists all the variables in the current scope Accessing Files /path/to/file dbfs:/path/to/file file:/path/to/file s3://path/to/file Copying Files %fs cp file:/<path> /Volumes/<catalog>/<schema>/<volume>/<path> %python dbutils.fs.cp("file:/<path>", "/Volumes/<catalog>/<schema>/<volume>/<path>") %python dbutils.fs.cp("file:/databricks/driver/test", "dbfs:/repo", True) %sh cp /<path> /Volumes/<catalog>/<schema>/<volume>/<path> SQL Statements (DDL) Create & Use Schema CREATE SCHEMA test; CREATE SCHEMA custom LOCATION 'dbfs:/custom'; USE SCHEMA test; Create Table CREATE TABLE test(col1 INT, col2 STRING, col3 STRING, col4 BIGINT, col5 INT, col6 FLOAT); CREATE TABLE test AS SELECT * EXCEPT (_rescued_data) FROM read_files('/repo/data/test.csv'); CREATE TABLE test USING CSV LOCATION '/repo/data/test.csv'; CREATE TABLE test USING CSV OPTIONS (header="true") LOCATION '/repo/data/test.csv'; CREATE TABLE test AS SELECT * EXCEPT (_rescued_data) FROM read_files('/repo/data/test.csv'); CREATE TABLE test AS ... CREATE TABLE test USING ... CREATE TABLE test(id INT, title STRING, col1 STRING, publish_time BIGINT, pages INT, price FLOAT) COMMENT 'This is comment for the table itself'; CREATE TABLE test AS SELECT * EXCEPT (_rescued_data) FROM read_files('/repo/data/test.json', format => 'json'); CREATE TABLE test_raw AS SELECT * EXCEPT (_rescued_data) FROM read_files('/repo/data/test.csv', sep => ';'); CREATE TABLE custom_table_test LOCATION 'dbfs:/custom-table' AS SELECT * EXCEPT (_rescued_data) FROM read_files('/repo/data/test.csv'); CREATE TABLE test PARTITIONED BY (col1) AS SELECT * EXCEPT (_rescued_data) FROM read_files('/repo/data/test.csv') CREATE TABLE users( firstname STRING, lastname STRING, full_name STRING GENERATED ALWAYS AS (concat(firstname, ' ', lastname)) ); CREATE OR REPLACE TABLE test AS SELECT * EXCEPT (_rescued_data) FROM read_files('/repo/data/test.csv'); CREATE OR REPLACE TABLE test AS SELECT * FROM json.`/repo/data/test.json`; CREATE OR REPLACE TABLE test AS SELECT * FROM read_files('/repo/data/test.csv'); Create View CREATE VIEW view_test AS SELECT * FROM test WHERE col1 = 'test'; CREATE VIEW view_test AS SELECT col1, col1 FROM test JOIN test2 ON test.col2 == test2.col2; CREATE TEMP VIEW temp_test AS SELECT * FROM test WHERE col1 = 'test'; CREATE TEMP VIEW temp_test AS SELECT * FROM read_files('/repo/data/test.csv'); CREATE GLOBAL TEMP VIEW view_test AS SELECT * FROM test WHERE col1 = 'test'; SELECT * FROM global_temp.view_test; CREATE TEMP VIEW jdbc_example USING JDBC OPTIONS ( url "<jdbc-url>", dbtable "<table-name>", user '<username>', password '<password>'); CREATE OR REPLACE TEMP VIEW test AS SELECT * FROM delta.`<logpath>`; CREATE VIEW event_log_raw AS SELECT * FROM event_log("<pipeline-id>"); CREATE OR REPLACE TEMP VIEW test_view AS SELECT test.col1 AS col1 FROM test_table WHERE col1 = 'value1' ORDER BY timestamp DESC LIMIT 1; Drop DROP TABLE test; Describe SHOW TABLES; DESCRIBE EXTENDED test; SQL Statements (DML) Select SELECT * FROM csv.`/repo/data/test.csv`; SELECT * FROM read_files('/repo/data/test.csv'); SELECT * FROM read_files('/repo/data/test.csv', format => 'csv', header => 'true', sep => ',') SELECT * FROM json.`/repo/data/test.json`; SELECT * FROM json.`/repo/data/*.json`; SELECT * FROM test WHERE year(from_unixtime(test_time)) > 1900; SELECT * FROM test WHERE title LIKE '%a%' SELECT * FROM test WHERE title LIKE 'a%' SELECT * FROM test WHERE title LIKE '%a' SELECT * FROM test TIMESTAMP AS OF '2024-01-01T00:00:00.000Z'; SELECT * FROM test VERSION AS OF 2; SELECT * FROM test@v2; SELECT * FROM event_log("<pipeline-id>"); SELECT count(*) FROM VALUES (NULL), (10), (10) AS example(col); SELECT count(col) FROM VALUES (NULL), (10), (10) AS example(col); SELECT count_if(col1 = 'test') FROM test; SELECT from_unixtime(test_time) FROM test; SELECT cast(test_time / 1 AS timestamp) FROM test; SELECT cast(cast(test_time AS BIGINT) AS timestamp) FROM test; SELECT element.sub_element FROM test; SELECT flatten(array(array(1, 2), array(3, 4))); SELECT * FROM ( SELECT col1, col2 FROM test ) PIVOT ( sum(col1) for col2 in ('item1','item2') ); SELECT *, CASE WHEN col1 > 10 THEN 'value1' ELSE 'value2' END FROM test; SELECT * FROM test ORDER BY (CASE WHEN col1 > 10 THEN col2 ELSE col3 END); WITH t(col1, col2) AS (SELECT 1, 2) SELECT * FROM t WHERE col1 = 1; SELECT details:flow_definition.output_dataset as output_dataset, details:flow_definition.input_datasets as input_dataset FROM event_log_raw, latest_update WHERE event_type = 'flow_definition' AND origin.update_id = latest_update.id; Insert INSERT OVERWRITE test SELECT * FROM read_files('/repo/data/test.csv'); INSERT INTO test(col1, col2) VALUES ('value1', 'value2'); Merge Into MERGE INTO test USING test_to_delete ON test.col1 = test_to_delete.col1 WHEN MATCHED THEN DELETE; MERGE INTO test USING test_to_update ON test.col1 = test_to_update.col1 WHEN MATCHED THEN UPDATE SET *; MERGE INTO test USING test_to_insert ON test.col1 = test_to_insert.col1 WHEN NOT MATCHED THEN INSERT *; Copy Into COPY INTO test FROM '/repo/data' FILEFORMAT = CSV FILES = ('test.csv') FORMAT_OPTIONS('header' = 'true', 'inferSchema' = 'true'); Delta Lake Statements DESCRIBE HISTORY test; DESCRIBE HISTORY test LIMIT 1; INSERT INTO test SELECT * FROM test@v2 WHERE id = 3; OPTIMIZE test; OPTIMIZE test ZORDER BY col1; RESTORE TABLE test TO VERSION AS OF 0; SELECT * FROM test TIMESTAMP AS OF '2024-01-01T00:00:00.000Z'; SELECT * FROM test VERSION AS OF 2; SELECT * FROM test@v2; VACUUM test; VACUUM test RETAIN 240 HOURS; %fs ls dbfs:/user/hive/warehouse/test/_delta_log %python spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false") Delta Live Table Statements CREATE OR REFRESH LIVE TABLE test_raw AS SELECT * FROM json.`/repo/data/test.json`; CREATE OR REFRESH STREAMING TABLE test AS SELECT * FROM STREAM read_files('/repo/data/test*.json'); CREATE OR REFRESH LIVE TABLE test_cleaned AS SELECT col1, col2, col3, col4 FROM live.test_raw; CREATE OR REFRESH LIVE TABLE recent_test AS SELECT col1, col2 FROM live.test2 ORDER BY creation_time DESC LIMIT 10; Fuctions CREATE OR REPLACE FUNCTION test_function(temp DOUBLE) RETURNS DOUBLE RETURN (col1 - 10); Auto Loader %python spark.readStream.format("cloudFiles")\ .option("cloudFiles.format", "json")\ .option("cloudFiles.schemaLocation", "/autoloader-schema")\ .option("pathGlobFilter", "test*.json")\ .load("/repo/data")\ .writeStream\ .option("mergeSchema", "true")\ .option("checkpointLocation", "/autoloader-checkpoint")\ .start("demo") %fs head /autoloader-schema/_schemas/0 CREATE OR REFRESH STREAMING TABLE test AS SELECT * FROM cloud_files( '/repo/data', 'json', map("cloudFiles.inferColumnTypes", "true", "pathGlobFilter", "test*.json") ); CONSTRAINT positive_timestamp EXPECT (creation_time > 0) CONSTRAINT positive_timestamp EXPECT (creation_time > 0) ON VIOLATION DROP ROW CONSTRAINT positive_timestamp EXPECT (creation_time > 0) ON VIOLATION FAIL UPDATE CDC Statements APPLY CHANGES INTO live.target FROM stream(live.cdc_source) KEYS (col1) APPLY AS DELETE WHEN col2 = "DELETE" SEQUENCE BY col3 COLUMNS * EXCEPT (col); Security Statements GRANT <privilege> ON <object_type> <object_name> TO <user_or_group>; GRANT SELECT ON TABLE test TO `databricks@degols.net`; REVOKE <privilege> ON <object_type> <object_name> FROM `test@gmail.com'; Links Databricks SQL Language Reference Cheat Sheets Compute creation cheat sheet Platform administration cheat sheet Production job scheduling cheat sheet Best Practices Delta Lake best practices Hyperparameter tuning with Hyperopt Deep learning in Databricks Recommendations for MLOps Unity Catalog best practices Cluster configuration best practices Instance pool configuration best practices Other Databricks Cheat Sheet 1 Databricks Notebook Markdown Cheat Sheet

List of Data Engineering & Data Science Courses

Data Engineering A Cloud Guru Apache Kafka Deep Dive AWS Certified Big Data Specialty Google Certified Professional Data Engineer Microsoft Certified: Azure Data Engineer Associate (DP-203) Coursera Introduction to Data Engineering DataCamp Building Data Engineering Pipelines in Python Database Design ETL in Python Introduction to Airflow in Python Introduction to Data Engineering NoSQL Concepts Streaming Concepts Understanding Data Engineering Google Building Batch Data Pipelines on Google Cloud Building Resilient Streaming Analytics Systems on Google Cloud Modernizing Data Lakes and Data Warehouses with Google Cloud Preparing for the Google Cloud Professional Data Engineer Exam Serverless Data Processing with Dataflow: Develop Pipelines Serverless Data Processing with Dataflow: Foundations Serverless Data Processing with Dataflow: Operations Udacity How to Become a Data Engineer Udemy Taming Big Data with Apache Spark and Python - Hands On! Whizlabs Apache Kafka Fundamentals Databricks Certified Associate Developer for Apache Spark (Python) Databricks Certified Data Analyst Associate Certification Databricks Certified Data Engineer Associate Certification Databricks Certified Data Engineer Professional Certification Snowflake SnowPro Core Certification Data Science A Cloud Guru Introduction to Machine Learning Coursera Data Science with Databricks for Data Analysts Specialization DataCamp Introduction to Data Science in Python Python Data Science Toolbox (Part 1) Google Data Science Foundations Data Science with Python Google Cloud Big Data and Machine Learning Fundamentals Intro to TensorFlow for Deep Learning Learn Python basics for data analysis Machine Learning Crash Course Smart Analytics, Machine Learning, and AI on Google Cloud Udemy AWS Certified Machine Learning Specialty 2023 - Hands On! Whizlabs AWS Certified Machine Learning Specialty Databricks Certified Machine Learning Associate Certification Databricks Certified Machine Learning Professional Certification Introduction to Data Science with Python TensorFlow for Deep Learning with Python

Mac Applications & Utilities

Mac Applications A list of recommended Mac applications … Microsoft 365 Excel OneDrive OneNote Outlook Powerpoint Teams Word MindNode - capture, organize, style & share your thoughts Notion - single space where you can think, write & plan Obsidian - private and flexible writing app that adapts to the way you think Parallels - add Windows, Linux, or another operating system to your Mac PyCharm - Python IDE Slack - messaging app Mac Utilities A list of recommended Mac utilities … ...

Music Production: MPE Controllers

About MIDI Polyphonic Expression (MPE) is a method of using MIDI that enables pitch bend, and other dimensions of expressive control, to be adjusted continuously for individual notes. MPE Controllers embodme ERAE Touch - born out of a desire to combine both sensitivity and versatility in a seamless object Keith McMillen QuNeo - MPE MIDI Finger Drum Controller Roger Linn Design LinnStrument - expressive MIDI controller for musical performance Roli LUMI Keys Studio Edition - world’s first keyboard controller with per-key pitchbend and polyphonic aftertouch, plus whole-key illumination that sparks new ideas for playing and composing Seaboard BLOCK M - compact and powerful, play any sound imaginable and express yourself effortlessly — wherever creativity finds you — on this evolution of the multi-award-winning original Seaboard RISE 2 - a keyboard designed for limitless exploration of sound Yamaha SEQTRAK - ultimate music creation station that empowers you to capture, create and connect with fans YouTube Videos ERAE Touch LinnStrument LUMI Keys Studio Edition QuNeo MPE Seaboard BLOCK M Seaboard RISE 2 SEQTRAK

Music Production: Physical Modeling

About Physical modelling is a process of recreating the physical reactions and unique interplay of both physical and electronic components using mathematical models and algorithms to create a realistic reproduction. Physically modelled instruments simulate the playability and complex behaviour of real acoustic instruments. Because there are no samples, the file size of these plugins are just a tiny fraction of that required by other virtual instruments. This is an excellent overview of physical modeling compared to sampling; ...

Music Production: NAMM 2024

About The NAMM Show is an annual event in the United States that is organized by the National Association of Music Merchants (NAMM), who describe it as “the industry’s largest stage, uniting the global music, sound and entertainment technology communities”. Homepage Inaugurated: January 1902 YouTube Videos 10 Crazy & CREATIVE things at NAMM 2024 20 AMAZING things at NAMM 2024 Doctor Mix At NAMM 2024: Synthesizer Supervlog Jordan Rudess Interview NAMM 2024 Jordan Rudess Demos The Pocket Piano KORG Grandstage X Stage Piano Korg’s Latest Gear Korg microKORG2 Korg PS-3300 Demo & Overview Korg PS-3300 FS Deep Dive at NAMM 2024 Nord Grand 2 Oberheim Inspired Analog Synth - Shear Electronics Relic | NAMM 2024 PWM - Mantis Studiologic at NAMM 2024: Justin-Lee Schultz Teenage Engineering - K.O. II - OB4 Yamaha - Montage M ESP Yamaha - Seqtrak

January 28, 2024 1 min

Music Production: Mobile Apps

iPad & iPhone Apps Recommended iPad & iPhone apps … Ableton Ableton Note - A place to start musical ideas AudioKit Pro AudioKit Synth One Synthesizer Jam with Jordan King of Digital | Hybrid Synth King of FM: DX Synth/E Piano LE05: Digitalism 2000 + AUv3 SAGA Synth | 16-Bit Super Fun! KORG ARP ODYSSEi - The ARP Odyssey of 1972 has earned an unshakable position as a classic analog synthesizer. In 2015, KORG produced the ARP ODYSSEY, a complete reproduction of the original KORG ELECTRIBE Wave - Groovebox KORG Gadget 3 - A Complete, Seamlessly Integrated Music Production Ecosystem KORG iMono/Poly - Synthesizer / Keyboard KORG iWAVESTATION - Synthesizer KORG Module Pro - High resolution Piano/Keyboard KV331 Audio SynthMaster One Modartt Pianoteq 8 - Pianoteq is an award-winning virtual instrument that can simulate many instruments (Pianos, Electric Pianos, Harpsichords, Vibraphones, Guitar, Steeldrums and more) thanks to physical modelling Roland Roland Zenbeats - Roland Zenbeats is a music creation app that keeps you in an effortless artistic flow. Sincere Apps Piano Chords and Scales - Play, learn theory, make songs iPhone Apps Recommended iPhone apps … ...

Music Production Blogs

Bedroom Producers Blog (bpb) logicxx Native Instruments Point Blank Music School Sound On Sound WA Production

Music Production Courses

Ableton Ableton Certified Training ADSR Ask.Video Coursera Groove3 London Academy of Music Production MacProVideo Point Blank Music School Producertech Skillshare Udemy Tomas George - music producer, composer & audio engineer Ableton Live 11 - Music Production in Ableton Live - learn music production, recording, editing, music mixing, songwriting & music theory Cubase Ask.Video Groove3 London Academy of Music Production Sonic Academy Steinberg Logic Pro Udemy Tomas George - music producer, composer & audio engineer Music Production in Logic Pro X - The Beginners Guide! - learn the basics of setting up, creating music & music production in Logic Pro X Music Production in Logic Pro X - The Complete Course! - creating, recording, mixing music & mastering in Logic Pro X Maschine ADSR Ask.Video MacProVideo Maschine Tutorials Producertech Masterclass Armin van Buuren Teaches Dance Music deadmau5 Teaches Electronic Music Production Timbaland Teaches Producing and Beatmaking Mix With The Masters Learn the entire process of music production, mixing and mastering from the world’s top engineers and producers through seminars and online videos ...

Music Production Websites

Ask.Video - Ableton, Cubase, Logic Pro, Native Instruments … Audio Plugin Guy - plugin deals Born to Produce Coursera Groove3 - Ableton, BitWig Studio, Cubase, FL Studio, Logic Pro, Maschine … London Academy of Music Production MacProVideo Point Blank Music School Producertech Skillshare Sonic Academy Steinberg Udemy